From 8e3b23cd430f21524e3ad72f66fc9703c5035e40 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 30 Oct 2019 14:58:12 +0100
Subject: [PATCH 001/633] adapt classifier #1 (not working)

---
 .../embedding_intent_classifier.py            | 571 ++++++++++++++----
 rasa/nlu/tokenizers/tokenizer.py              |   3 +
 rasa/utils/train_utils.py                     |  30 +-
 3 files changed, 487 insertions(+), 117 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7d68232f74d2..a403621083e3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -7,9 +7,12 @@
 from typing import Any, Dict, List, Optional, Text, Tuple, Union
 import warnings
 
-from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features
+from tf_metrics import f1
+
+from nlu.extractors import EntityExtractor
+from nlu.test import determine_token_labels
+from nlu.tokenizers.tokenizer import Token
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
-from rasa.nlu.components import Component
 from rasa.utils import train_utils
 from rasa.utils.train_utils import SessionData
 from rasa.nlu.constants import (
@@ -17,6 +20,8 @@
     MESSAGE_TEXT_ATTRIBUTE,
     MESSAGE_VECTOR_SPARSE_FEATURE_NAMES,
     MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
+    MESSAGE_TOKENS_NAMES,
+    MESSAGE_ENTITIES_ATTRIBUTE,
 )
 
 import tensorflow as tf
@@ -33,7 +38,7 @@
     from rasa.nlu.training_data import Message
 
 
-class EmbeddingIntentClassifier(Component):
+class EmbeddingIntentClassifier(EntityExtractor):
     """Intent classifier using supervised embeddings.
 
     The embedding intent classifier embeds user inputs
@@ -52,9 +57,9 @@ class EmbeddingIntentClassifier(Component):
     and additional hidden layers are added together with dropout.
     """
 
-    provides = ["intent", "intent_ranking"]
+    provides = ["intent", "intent_ranking", "entities"]
 
-    requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]]
+    requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
@@ -65,8 +70,21 @@ class EmbeddingIntentClassifier(Component):
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
         "hidden_layers_sizes_b": [],
+        # sizes of hidden layers before the embedding layer for tag labels
+        # the number of hidden layers is thus equal to the length of this list
+        "hidden_layers_sizes_c": [],
         # Whether to share the hidden layer weights between input words and labels
         "share_hidden_layers": False,
+        # number of units in transformer
+        "transformer_size": 128,
+        # number of transformer layers
+        "num_transformer_layers": 1,
+        # number of attention heads in transformer
+        "num_heads": 4,
+        # type of positional encoding in transformer
+        "pos_encoding": "timing",  # string 'timing' or 'emb'
+        # max sequence length if pos_encoding='emb'
+        "max_seq_length": 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -103,11 +121,18 @@ class EmbeddingIntentClassifier(Component):
         "C_emb": 0.8,
         # dropout rate for rnn
         "droprate": 0.2,
+        # use a unidirectional or bidirectional encoder
+        "unidirectional_encoder": True,
         # visualization of accuracy
         # how often to calculate training accuracy
         "evaluate_every_num_epochs": 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
         "evaluate_on_num_examples": 0,  # large values may hurt performance
+        # model config
+        # if true intent classification is trained and intent predicted
+        "intent_classification": True,
+        # if true named entity recognition is trained and entities predicted
+        "named_entity_recognition": True,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -115,39 +140,47 @@ def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
         inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
         session: Optional["tf.Session"] = None,
         graph: Optional["tf.Graph"] = None,
         message_placeholder: Optional["tf.Tensor"] = None,
         label_placeholder: Optional["tf.Tensor"] = None,
+        tag_placeholder: Optional["tf.Tensor"] = None,
         similarity_all: Optional["tf.Tensor"] = None,
-        pred_confidence: Optional["tf.Tensor"] = None,
+        intent_prediction: Optional["tf.Tensor"] = None,
+        tag_prediction: Optional["tf.Tensor"] = None,
         similarity: Optional["tf.Tensor"] = None,
-        message_embed: Optional["tf.Tensor"] = None,
         label_embed: Optional["tf.Tensor"] = None,
         all_labels_embed: Optional["tf.Tensor"] = None,
+        attention_weights: Optional["tf.Tensor"] = None,
     ) -> None:
         """Declare instant variables with default values"""
-
         super(EmbeddingIntentClassifier, self).__init__(component_config)
 
         self._load_params()
 
         # transform numbers to labels
         self.inverted_label_dict = inverted_label_dict
+        # transform numbers to tags
+        self.inverted_tag_dict = inverted_tag_dict
         # encode all label_ids with numbers
         self._encoded_all_label_ids = None
+        # encode all tag_ids with numbers
+        self._encoded_all_tag_ids = None
 
         # tf related instances
         self.session = session
         self.graph = graph
         self.a_in = message_placeholder
         self.b_in = label_placeholder
+        self.c_in = tag_placeholder
         self.sim_all = similarity_all
-        self.pred_confidence = pred_confidence
+        self.intent_prediction = intent_prediction
+        self.entity_prediction = tag_prediction
         self.sim = similarity
+        self.attention_weights = attention_weights
 
         # persisted embeddings
-        self.message_embed = message_embed
         self.label_embed = label_embed
         self.all_labels_embed = all_labels_embed
 
@@ -177,6 +210,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
         self.hidden_layer_sizes = {
             "a": config["hidden_layers_sizes_a"],
             "b": config["hidden_layers_sizes_b"],
+            "c": config["hidden_layers_sizes_c"],
         }
         self.share_hidden_layers = config["share_hidden_layers"]
         if (
@@ -195,6 +229,13 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
 
         self.random_seed = self.component_config["random_seed"]
 
+        self.transformer_size = self.component_config["transformer_size"]
+        self.num_transformer_layers = self.component_config["num_transformer_layers"]
+        self.num_heads = self.component_config["num_heads"]
+        self.pos_encoding = self.component_config["pos_encoding"]
+        self.max_seq_length = self.component_config["max_seq_length"]
+        self.unidirectional_encoder = self.component_config["unidirectional_encoder"]
+
     def _load_embedding_params(self, config: Dict[Text, Any]) -> None:
         self.embed_dim = config["embed_dim"]
         self.num_neg = config["num_neg"]
@@ -233,6 +274,11 @@ def _load_params(self) -> None:
         self._load_regularization_params(self.component_config)
         self._load_visual_params(self.component_config)
 
+        self.intent_classification = self.component_config["intent_classification"]
+        self.named_entity_recognition = self.component_config[
+            "named_entity_recognition"
+        ]
+
     # package safety checks
     @classmethod
     def required_packages(cls) -> List[Text]:
@@ -252,6 +298,25 @@ def _create_label_id_dict(
             label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids))
         }
 
+    @staticmethod
+    def _create_tag_id_dict(
+        training_data: "TrainingData", attribute: Text
+    ) -> Dict[Text, int]:
+        """Create label_id dictionary"""
+
+        distinct_tag_ids = set(
+            [
+                e["entity"]
+                for example in training_data.entity_examples
+                for e in example.get(attribute)
+            ]
+        ) - {None}
+        tag_id_dict = {
+            tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
+        }
+        tag_id_dict["O"] = 0
+        return tag_id_dict
+
     @staticmethod
     def _find_example_for_label(
         label: Text, examples: List["Message"], attribute: Text
@@ -261,6 +326,14 @@ def _find_example_for_label(
                 return ex
         return None
 
+    @staticmethod
+    def _find_example_for_tag(tag, examples, attribute):
+        for ex in examples:
+            for e in ex.get(attribute):
+                if e["entity"] == tag:
+                    return ex
+        return None
+
     @staticmethod
     def _check_labels_features_exist(
         labels_example: List[Tuple[int, "Message"]], attribute_feature_name: Text
@@ -278,12 +351,7 @@ def _extract_labels_precomputed_features(
 
         # Collect precomputed encodings
         encoded_id_labels = [
-            (
-                label_idx,
-                sequence_to_sentence_features(label_example.get(attribute_feature_name))
-                .toarray()
-                .squeeze(),
-            )
+            (label_idx, label_example.get(attribute_feature_name))
             for (label_idx, label_example) in label_examples
         ]
 
@@ -308,9 +376,11 @@ def _create_encoded_label_ids(
         attribute: Text,
         attribute_feature_name: Text,
     ) -> np.ndarray:
-        """Create matrix with label_ids encoded in rows as bag of words. If the features are already computed, fetch
-        them from the message object else compute a one hot encoding for the label as the feature vector
-        Find a training example for each label and get the encoded features from the corresponding Message object"""
+        """Create matrix with label_ids encoded in rows as bag of words. If the
+        features are already computed, fetch them from the message object else compute
+        a one hot encoding for the label as the feature vector. Find a training example
+        for each label and get the encoded features from the corresponding Message
+        object."""
 
         labels_example = []
 
@@ -327,24 +397,56 @@ def _create_encoded_label_ids(
                 labels_example, attribute_feature_name
             )
         else:
-            encoded_id_labels = self._compute_default_label_features(labels_example)
+            features = self._compute_default_label_features(labels_example)
+            encoded_id_labels = [scipy.sparse.csr_matrix(f) for f in features]
+            encoded_id_labels = np.array(encoded_id_labels)
 
         return encoded_id_labels
 
+    def _create_encoded_tag_ids(
+        self,
+        training_data: "TrainingData",
+        tag_id_dict: Dict[Text, int],
+        attribute: Text,
+    ) -> np.ndarray:
+        """Create matrix with tag_ids encoded in rows as bag of words. If the features
+        are already computed, fetch them from the message object else compute a one
+        hot encoding for the label as the feature vector.
+        Find a training example for each tag and get the encoded features from the
+        corresponding Message object."""
+
+        tags_example = []
+
+        # Collect one example for each label
+        for tag_name, idx in tag_id_dict.items():
+            tag_example = self._find_example_for_tag(
+                tag_name, training_data.entity_examples, attribute
+            )
+            tags_example.append((idx, tag_example))
+
+        # Collect features, precomputed if they exist, else compute on the fly
+        features = self._compute_default_label_features(tags_example)
+        encoded_id_tags = [scipy.sparse.csr_matrix(f) for f in features]
+        encoded_id_tags = np.array(encoded_id_tags)
+
+        return encoded_id_tags
+
     # noinspection PyPep8Naming
     def _create_session_data(
         self,
         training_data: "TrainingData",
         label_id_dict: Dict[Text, int],
+        tag_id_dict: Dict[Text, int],
         attribute: Text,
     ) -> "SessionData":
         """Prepare data for training and create a SessionData object"""
         X_sparse = []
         X_dense = []
         Y = []
-        label_ids = []
+        intent_ids = []
+        tag_ids = []
 
-        for e in training_data.intent_examples:
+        for e in training_data.training_examples:
             if e.get(attribute):
                 x_sparse, x_dense = self._get_x_features(e)
 
@@ -353,13 +455,23 @@ def _create_session_data(
                 if x_dense is not None:
                     X_dense.append(x_dense)
 
-                label_ids.append(label_id_dict[e.get(attribute)])
+                intent_ids.append(label_id_dict[e.get(attribute)])
+
+        for e in training_data.training_examples:
+            _tags = []
+            for t in e.get("tokens"):
+                _tag = determine_token_labels(
+                    t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
+                )
+                _tags.append(tag_id_dict[_tag])
+            tag_ids.append(scipy.sparse.csr_matrix(np.array([_tags]).T))
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
-        label_ids = np.array(label_ids)
+        intent_ids = np.array(intent_ids)
+        tag_ids = np.array(tag_ids)
 
-        for label_id_idx in label_ids:
+        for label_id_idx in intent_ids:
             Y.append(self._encoded_all_label_ids[label_id_idx])
         Y = np.array(Y)
 
@@ -369,7 +481,11 @@ def _create_session_data(
         if X_dense.size > 0:
             X_dict["text_features_dense"] = X_dense
 
-        return SessionData(X_dict, {"intent_features": Y}, {"intent_ids": label_ids})
+        return SessionData(
+            X_dict,
+            {"intent_features": Y},
+            {"intent_ids": intent_ids, "tag_ids": tag_ids},
+        )
 
     def _get_x_features(
         self, message: "Message"
@@ -384,16 +500,16 @@ def _get_x_features(
             message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])
             is not None
         ):
-            x_sparse = sequence_to_sentence_features(
-                message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])
+            x_sparse = message.get(
+                MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]
             )
 
         if (
             message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])
             is not None
         ):
-            x_dense = sequence_to_sentence_features(
-                message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])
+            x_dense = message.get(
+                MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]
             )
 
         return x_sparse, x_dense
@@ -424,37 +540,98 @@ def _create_tf_embed_fnn(
             layer_name_suffix=embed_name,
         )
 
-    def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]:
+    def _build_tf_train_graph(
+        self
+    ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]:
+        intent_loss = tf.constant(0.0)
+        intent_metric = tf.constant(0.0)
+        entity_loss = tf.constant(0.0)
+        entity_metric = tf.constant(0.0)
+
         # batch = 1 or 2 a_in values, b_in, intent_ids
         batch = self._iterator.get_next()
-        self.a_in, self.b_in = self.batch_to_input(batch)
+        self.a_in, self.b_in, self.c_in = self.batch_to_input(batch)
+
+        # transformer
+        a, mask = self._create_tf_sequence(self.a_in)
+
+        if self.intent_classification:
+            intent_loss, intent_metric = self.build_intent_train_graph(a, mask)
+
+        if self.named_entity_recognition:
+            entity_loss, entity_metric = self.build_entity_train_graph(a, mask)
 
-        all_label_ids = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids"
+        return intent_loss, intent_metric, entity_loss, entity_metric
+
+    def build_entity_train_graph(self, a, mask):
+        # get sequence lengths for NER
+        sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+        if len(sequence_lengths.shape) > 1:
+            sequence_lengths = tf.squeeze(sequence_lengths)
+
+        sequence_lengths.set_shape([mask.shape[0]])
+
+        # shape: batch-size, seq-len, dim
+        self.c_in = tf.reduce_sum(tf.nn.relu(self.c_in), -1)
+
+        # CRF
+        crf_params, logits, pred_ids = self._create_crf(a, sequence_lengths)
+
+        # Loss
+        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
+            logits, self.c_in, sequence_lengths, crf_params
         )
+        loss = tf.reduce_mean(-log_likelihood)
 
-        self.message_embed = self._create_tf_embed_fnn(
-            self.a_in,
-            self.hidden_layer_sizes["a"],
-            fnn_name="a_b" if self.share_hidden_layers else "a",
-            embed_name="a",
+        pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
+
+        # Metrics
+        weights = tf.sequence_mask(sequence_lengths)
+        num_tags = len(self.inverted_tag_dict)
+        metric = f1(self.c_in, pred_ids, num_tags, pos_tag_indices, weights)[1]
+
+        return loss, metric
+
+    def build_intent_train_graph(
+        self, a: tf.Tensor, mask: tf.Tensor
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
+        last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        last = tf.expand_dims(last, -1)
+
+        # get _cls_ vector for intent classification
+        cls_embed = tf.reduce_sum(a * last, 1)
+        cls_embed = train_utils.create_tf_embed(
+            cls_embed,
+            self.embed_dim,
+            self.C2,
+            self.similarity_type,
+            layer_name_suffix="a",
         )
 
-        self.label_embed = self._create_tf_embed_fnn(
-            self.b_in,
+        # all_label_ids is tensor of label-count x 1 x feature-len for labels
+        all_label_ids = tf.sparse_tensor_to_dense(
+            self._encoded_all_label_ids, name="all_labels_raw"
+        )
+        all_label_ids = tf.reduce_sum(tf.nn.relu(all_label_ids), 1)
+
+        self.all_labels_embed = self._create_tf_embed_fnn(
+            all_label_ids,
             self.hidden_layer_sizes["b"],
             fnn_name="a_b" if self.share_hidden_layers else "b",
             embed_name="b",
         )
-        self.all_labels_embed = self._create_tf_embed_fnn(
-            all_label_ids,
+
+        self.b_in = tf.reduce_sum(tf.nn.relu(self.b_in), 1)
+
+        self.label_embed = self._create_tf_embed_fnn(
+            self.b_in,
             self.hidden_layer_sizes["b"],
             fnn_name="a_b" if self.share_hidden_layers else "b",
             embed_name="b",
         )
 
-        return train_utils.calculate_loss_acc(
-            self.message_embed,
+        intent_loss, intent_metric = train_utils.calculate_loss_acc(
+            cls_embed,
             self.label_embed,
             self.b_in,
             self.all_labels_embed,
@@ -468,37 +645,81 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]:
             self.C_emb,
             self.scale_loss,
         )
+        return intent_loss, intent_metric
+
+    def _create_crf(
+        self, input: tf.Tensor, sequence_lengths: tf.Tensor
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        num_tags = len(self.inverted_tag_dict)
+
+        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+            logits = tf.layers.dense(input, num_tags, name="crf-logits")
+            crf_params = tf.get_variable(
+                "crf-params", [num_tags, num_tags], dtype=tf.float32
+            )
+            pred_ids, _ = tf.contrib.crf.crf_decode(
+                logits, crf_params, sequence_lengths
+            )
+
+            return crf_params, logits, pred_ids
+
+    def _create_tf_sequence(self, a_in: tf.Tensor) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        """Create sequence level embedding and mask."""
+        # mask different length sequences
+        # if there is at least one `-1` it should be masked
+        mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1)
+
+        a_in = train_utils.create_tf_fnn(
+            a_in,
+            self.hidden_layer_sizes["a"],
+            self.droprate,
+            self.C2,
+            self._is_training,
+            layer_name_suffix="a",
+        )
+        self.attention_weights = {}
+        hparams = train_utils.create_t2t_hparams(
+            self.num_transformer_layers,
+            self.transformer_size,
+            self.num_heads,
+            self.droprate,
+            self.pos_encoding,
+            self.max_seq_length,
+            self._is_training,
+            self.unidirectional_encoder,
+        )
+
+        a = train_utils.create_t2t_transformer_encoder(
+            a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
+        )
 
-    def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]:
+        return a, mask
+
+    def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """Convert batch input into correct tensors.
 
         As we do not know what features (sparse and/or dense) were used, we need to
         check what features are provided and parse them accordingly.
         """
-        # batch contains 1 or 2 a_in values, b_in, label_ids
-        b_in = batch[-2]
-
-        if len(batch) == 3:
-            a_in = self._squeeze_sparse_features(batch[0])
-            return a_in, b_in
+        # batch contains 1 or 2 a_in values, b_in, label_ids, tag_ids
+        b_in = batch[-3]
+        c_in = batch[-1]
 
         if len(batch) == 4:
-            a_in_1 = self._squeeze_sparse_features(batch[0])
-            a_in_2 = self._squeeze_sparse_features(batch[1])
+            a_in = batch[0]
+            return a_in, b_in, c_in
+
+        if len(batch) == 5:
+            a_in_1 = batch[0]
+            a_in_2 = batch[1]
             # Concatenate a_in features
+            # TODO should not be just concatenated
             a_in = tf.concat([a_in_1, a_in_2], axis=1)
 
-            return a_in, b_in
+            return a_in, b_in, c_in
 
         raise ValueError("Iterator return unexpected number of tensors.")
 
-    def _squeeze_sparse_features(self, a_in: tf.Tensor) -> tf.Tensor:
-        # as sparse features come from a scipy.sparse.csr_matrix they have an
-        # additional dimension
-        if len(a_in.shape) == 3:
-            a_in = tf.squeeze(a_in, axis=1)
-        return a_in
-
     def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor":
         num_features_sparse = self._get_num_of_features(
             session_data, "text_features_sparse"
@@ -515,32 +736,60 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor":
             (None, None, session_data.Y["intent_features"][0].shape[-1]),
             name="b",
         )
-
-        self.message_embed = self._create_tf_embed_fnn(
-            self.a_in,
-            self.hidden_layer_sizes["a"],
-            fnn_name="a_b" if self.share_hidden_layers else "a",
-            embed_name="a",
+        self.c_in = tf.placeholder(
+            tf.int64,
+            (None, None, session_data.labels["tag_ids"][0].shape[-1]),
+            name="c",
         )
 
-        self.sim_all = train_utils.tf_raw_sim(
-            self.message_embed[:, tf.newaxis, :],
-            self.all_labels_embed[tf.newaxis, :, :],
-            None,
-        )
+        a, mask = self._create_tf_sequence(self.a_in)
 
-        self.label_embed = self._create_tf_embed_fnn(
-            self.b_in,
-            self.hidden_layer_sizes["b"],
-            fnn_name="a_b" if self.share_hidden_layers else "b",
-            embed_name="b",
-        )
+        if self.intent_classification:
+            last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+            last = tf.expand_dims(last, -1)
 
-        self.sim = train_utils.tf_raw_sim(
-            self.message_embed[:, tf.newaxis, :], self.label_embed, None
-        )
+            # get _cls_ embedding
+            cls_embed = tf.reduce_sum(a * last, 1)
+            cls_embed = train_utils.create_tf_embed(
+                cls_embed,
+                self.embed_dim,
+                self.C2,
+                self.similarity_type,
+                layer_name_suffix="a",
+            )
 
-        return train_utils.confidence_from_sim(self.sim_all, self.similarity_type)
+            # reduce dimensionality as input should not be sequence for intent
+            # classification
+            self.b_in = tf.reduce_sum(self.b_in, 1)
+
+            self.sim_all = train_utils.tf_raw_sim(
+                cls_embed[:, tf.newaxis, :],
+                self.all_labels_embed[tf.newaxis, :, :],
+                None,
+            )
+
+            self.label_embed = self._create_tf_embed_fnn(
+                self.b_in,
+                self.hidden_layer_sizes["b"],
+                fnn_name="a_b" if self.share_hidden_layers else "b",
+                embed_name="b",
+            )
+
+            # predict intents
+            self.sim = train_utils.tf_raw_sim(
+                cls_embed[:, tf.newaxis, :], self.label_embed, None
+            )
+            self.intent_prediction = train_utils.confidence_from_sim(
+                self.sim_all, self.similarity_type
+            )
+
+        if self.named_entity_recognition:
+            # get sequence lengths for NER
+            sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+
+            # predict tags
+            _, _, pred_ids = self._create_crf(a, sequence_lengths)
+            self.entity_prediction = tf.to_int64(pred_ids)
 
     def _get_num_of_features(self, session_data: "SessionData", x_key: Text) -> int:
         return session_data.X[x_key][0].shape[-1] if x_key in session_data.X else 0
@@ -571,8 +820,8 @@ def preprocess_train_data(self, training_data: "TrainingData"):
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=MESSAGE_INTENT_ATTRIBUTE
         )
-
         self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
+
         self._encoded_all_label_ids = self._create_encoded_label_ids(
             training_data,
             label_id_dict,
@@ -582,6 +831,15 @@ def preprocess_train_data(self, training_data: "TrainingData"):
             ],
         )
 
+        tag_id_dict = self._create_tag_id_dict(
+            training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE
+        )
+        self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
+
+        self._encoded_all_tag_ids = self._create_encoded_tag_ids(
+            training_data, tag_id_dict, attribute=MESSAGE_ENTITIES_ATTRIBUTE
+        )
+
         # check if number of negatives is less than number of label_ids
         logger.debug(
             "Check if num_neg {} is smaller than "
@@ -593,7 +851,10 @@ def preprocess_train_data(self, training_data: "TrainingData"):
         self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1)
 
         session_data = self._create_session_data(
-            training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE
+            training_data,
+            label_id_dict,
+            tag_id_dict,
+            attribute=MESSAGE_INTENT_ATTRIBUTE,
         )
 
         self.check_input_dimension_consistency(session_data)
@@ -618,15 +879,16 @@ def train(
 
         session_data = self.preprocess_train_data(training_data)
 
-        possible_to_train = self._check_enough_labels(session_data)
+        if self.intent_classification:
+            possible_to_train = self._check_enough_labels(session_data)
 
-        if not possible_to_train:
-            logger.error(
-                "Can not train a classifier. "
-                "Need at least 2 different classes. "
-                "Skipping training of classifier."
-            )
-            return
+            if not possible_to_train:
+                logger.error(
+                    "Can not train a classifier. "
+                    "Need at least 2 different classes. "
+                    "Skipping training of classifier."
+                )
+                return
 
         if self.evaluate_on_num_examples:
             session_data, eval_session_data = train_utils.train_val_split(
@@ -660,19 +922,24 @@ def train(
 
             self._is_training = tf.placeholder_with_default(False, shape=())
 
-            loss, acc = self._build_tf_train_graph()
+            intent_loss, intent_acc, entity_loss, entity_f1 = (
+                self._build_tf_train_graph()
+            )
+
+            loss = intent_loss + entity_loss
 
             # define which optimizer to use
             self._train_op = tf.train.AdamOptimizer().minimize(loss)
 
             # train tensorflow graph
             self.session = tf.Session(config=self._tf_config)
+            # TODO proper loss, acc handling
             train_utils.train_tf_dataset(
                 train_init_op,
                 eval_init_op,
                 batch_size_in,
                 loss,
-                acc,
+                intent_acc,
                 self._train_op,
                 self.session,
                 self._is_training,
@@ -683,14 +950,18 @@ def train(
             )
 
             # rebuild the graph for prediction
-            self.pred_confidence = self._build_tf_pred_graph(session_data)
+            self._build_tf_pred_graph(session_data)
+
+            self.attention_weights = train_utils.extract_attention(
+                self.attention_weights
+            )
 
     # process helpers
     # noinspection PyPep8Naming
     def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float]]:
         """Calculate message similarities"""
 
-        message_sim = self.session.run(self.pred_confidence, feed_dict={self.a_in: X})
+        message_sim = self.session.run(self.intent_prediction, feed_dict={self.a_in: X})
 
         message_sim = message_sim.flatten()  # sim is a matrix
 
@@ -740,7 +1011,7 @@ def _extract_features(self, message: "Message") -> np.ndarray:
         x_sparse, x_dense = self._get_x_features(message)
 
         if x_sparse is not None:
-            x_sparse = x_sparse.toarray().squeeze().reshape(1, -1)
+            x_sparse = x_sparse.toarray().reshape(1, -1)
 
         if x_dense is not None:
             x_dense = x_dense.reshape(1, -1)
@@ -756,13 +1027,78 @@ def _extract_features(self, message: "Message") -> np.ndarray:
 
         raise ValueError("No features found for X.")
 
+    def predict_entities(self, message: "Message") -> List[Dict]:
+        if self.session is None:
+            logger.error(
+                "There is no trained tf.session: "
+                "component is either not trained or "
+                "didn't receive enough training data"
+            )
+        else:
+            # get features (bag of words) for a message
+            # noinspection PyPep8Naming
+            X = self._extract_features(message)
+
+            # load tf graph and session
+            predictions = self.session.run(
+                self.entity_prediction, feed_dict={self.a_in: X}
+            )
+
+            tags = [self.inverted_tag_dict[p] for p in predictions[0]]
+
+            entities = self._convert_tags_to_entities(
+                message.text, message.get("tokens", []), tags
+            )
+
+            extracted = self.add_extractor_name(entities)
+            entities = message.get("entities", []) + extracted
+
+            return entities
+
+    def _convert_tags_to_entities(
+        self, text: str, tokens: List[Token], tags: List[Text]
+    ) -> List[Dict[Text, Any]]:
+        entities = []
+        last_tag = "O"
+        for token, tag in zip(tokens, tags):
+            if tag == "O":
+                last_tag = tag
+                continue
+
+            # new tag found
+            if last_tag != tag:
+                entity = {
+                    "entity": tag,
+                    "start": token.offset,
+                    "end": token.end,
+                    "extractor": "flair",
+                }
+                entities.append(entity)
+
+            # belongs to last entity
+            elif last_tag == tag:
+                entities[-1]["end"] = token.end
+
+            last_tag = tag
+
+        for entity in entities:
+            entity["value"] = text[entity["start"] : entity["end"]]
+
+        return entities
+
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
 
-        label, label_ranking = self.predict_label(message)
+        if self.intent_classification:
+            label, label_ranking = self.predict_label(message)
+
+            message.set("intent", label, add_to_output=True)
+            message.set("intent_ranking", label_ranking, add_to_output=True)
 
-        message.set("intent", label, add_to_output=True)
-        message.set("intent_ranking", label_ranking, add_to_output=True)
+        if self.named_entity_recognition:
+            entities = self.predict_entities(message)
+
+            message.set("entities", entities, add_to_output=True)
 
     def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         """Persist this model into the passed directory.
@@ -786,19 +1122,26 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         with self.graph.as_default():
             train_utils.persist_tensor("message_placeholder", self.a_in, self.graph)
             train_utils.persist_tensor("label_placeholder", self.b_in, self.graph)
+            train_utils.persist_tensor("tag_placeholder", self.c_in, self.graph)
 
             train_utils.persist_tensor("similarity_all", self.sim_all, self.graph)
             train_utils.persist_tensor(
-                "pred_confidence", self.pred_confidence, self.graph
+                "intent_prediction", self.intent_prediction, self.graph
+            )
+            train_utils.persist_tensor(
+                "entity_prediction", self.entity_prediction, self.graph
             )
             train_utils.persist_tensor("similarity", self.sim, self.graph)
 
-            train_utils.persist_tensor("message_embed", self.message_embed, self.graph)
             train_utils.persist_tensor("label_embed", self.label_embed, self.graph)
             train_utils.persist_tensor(
                 "all_labels_embed", self.all_labels_embed, self.graph
             )
 
+            train_utils.persist_tensor(
+                "attention_weights", self.attention_weights, self.graph
+            )
+
             saver = tf.train.Saver()
             saver.save(self.session, checkpoint)
 
@@ -807,6 +1150,9 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         ) as f:
             pickle.dump(self.inverted_label_dict, f)
 
+        with open(os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "wb") as f:
+            pickle.dump(self.inverted_tag_dict, f)
+
         with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "wb") as f:
             pickle.dump(self._tf_config, f)
 
@@ -838,33 +1184,44 @@ def load(
 
                 a_in = train_utils.load_tensor("message_placeholder")
                 b_in = train_utils.load_tensor("label_placeholder")
+                c_in = train_utils.load_tensor("tag_placeholder")
 
                 sim_all = train_utils.load_tensor("similarity_all")
-                pred_confidence = train_utils.load_tensor("pred_confidence")
+                intent_prediction = train_utils.load_tensor("intent_prediction")
+                tag_prediction = train_utils.load_tensor("tag_prediction")
                 sim = train_utils.load_tensor("similarity")
 
-                message_embed = train_utils.load_tensor("message_embed")
                 label_embed = train_utils.load_tensor("label_embed")
                 all_labels_embed = train_utils.load_tensor("all_labels_embed")
 
+                attention_weights = train_utils.load_tensor("attention_weights")
+
             with open(
                 os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb"
             ) as f:
                 inv_label_dict = pickle.load(f)
 
+            with open(
+                os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "rb"
+            ) as f:
+                inv_tag_dict = pickle.load(f)
+
             return cls(
                 component_config=meta,
                 inverted_label_dict=inv_label_dict,
+                inverted_tag_dict=inv_tag_dict,
                 session=session,
                 graph=graph,
                 message_placeholder=a_in,
                 label_placeholder=b_in,
+                tag_placeholder=c_in,
                 similarity_all=sim_all,
-                pred_confidence=pred_confidence,
+                intent_prediction=intent_prediction,
+                tag_prediction=tag_prediction,
                 similarity=sim,
-                message_embed=message_embed,
                 label_embed=label_embed,
                 all_labels_embed=all_labels_embed,
+                attention_weights=attention_weights,
             )
 
         else:
diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py
index bd9ec25d1475..f6619004d2d2 100644
--- a/rasa/nlu/tokenizers/tokenizer.py
+++ b/rasa/nlu/tokenizers/tokenizer.py
@@ -48,6 +48,9 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
     def add_cls_token(
         self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
     ) -> List[Token]:
+        if not tokens:
+            return tokens
+
         if (
             attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE]
             and self.use_cls_token
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 1df1c0b3d79a..46fbee0da3c6 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -340,15 +340,24 @@ def gen_batch(
 
 
 def sparse_to_dense(
-    examples: Union[np.ndarray, List[scipy.sparse.csr_matrix]]
-) -> np.ndarray:
-    # in case of BOW features it'll be either a 2D dense array or list of sparse
-    # matrices 1xN (because sparse vector doesn't exist)
-    # in case of sequence it'll be either a 3D dense array or a list of sparse
-    # matrices seq_lenxN
-    if isinstance(examples[0], scipy.sparse.spmatrix):
-        return np.stack([e.toarray() for e in examples])
-    return examples
+    data: Union[np.ndarray, List[scipy.sparse.spmatrix]], dtype=np.int32, use_zero=False
+):
+    if isinstance(data[0], scipy.sparse.spmatrix):
+        return np.stack([e.toarray() for e in data])
+
+    data_size = len(data)
+    max_seq_len = max([x.shape[0] for x in data])
+    feature_len = max([x.shape[-1] for x in data])
+
+    if use_zero:
+        data_dense = np.zeros([data_size, max_seq_len, feature_len], dtype=dtype)
+    else:
+        data_dense = np.ones([data_size, max_seq_len, feature_len], dtype=dtype) * -1
+
+    for i in range(data_size):
+        data_dense[i, : data[i].shape[0], :] = data[i]
+
+    return data_dense
 
 
 # noinspection PyPep8Naming
@@ -511,6 +520,7 @@ def create_t2t_hparams(
     pos_encoding: Text,
     max_seq_length: int,
     is_training: "tf.Tensor",
+    unidirectional_encoder: bool = True,
 ) -> "HParams":
     """Create parameters for t2t transformer."""
 
@@ -526,7 +536,7 @@ def create_t2t_hparams(
 
     hparams.max_length = max_seq_length
 
-    hparams.unidirectional_encoder = True
+    hparams.unidirectional_encoder = unidirectional_encoder
 
     hparams.self_attention_type = "dot_product_relative_v2"
     hparams.max_relative_position = 5

From 01cbbe48401e0e2e394e02d07afbec641c81dffc Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 7 Nov 2019 14:52:57 +0100
Subject: [PATCH 002/633] adding lm featurizer

---
 .../pretrained_lm_featurizer.py               | 147 ++++++++++++++++++
 rasa/nlu/registry.py                          |   4 +
 requirements.txt                              |   3 +
 3 files changed, 154 insertions(+)
 create mode 100644 rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
new file mode 100644
index 000000000000..165831576ea9
--- /dev/null
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -0,0 +1,147 @@
+import logging
+import os
+import re
+import scipy.sparse
+from typing import Any, Dict, List, Optional, Text
+from rasa.nlu import utils
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.featurizers.featurzier import Featurizer
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    MESSAGE_TEXT_ATTRIBUTE,
+    MESSAGE_TOKENS_NAMES,
+    MESSAGE_ATTRIBUTES,
+    MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
+    MESSAGE_INTENT_ATTRIBUTE,
+    SPACY_FEATURIZABLE_ATTRIBUTES,
+)
+import torch
+from transformers import *
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+model_dictionary = {
+    "bert-base-uncased": BertModel,
+    "openai-gpt": OpenAIGPTModel,
+    "gpt2": GPT2Model,
+    "transfo-xl-wt103": TransfoXLModel,
+    "xlnet-base-cased": XLNetModel,
+    "xlm-mlm-enfr-1024": XLMModel,
+    "distilbert-base-uncased": DistilBertModel,
+    "roberta-base": RobertaModel,
+}
+
+tokenizer_dictionary = {
+    "bert-base-uncased": BertTokenizer,
+    "openai-gpt": OpenAIGPTTokenizer,
+    "gpt2": GPT2Tokenizer,
+    "transfo-xl-wt103": TransfoXLTokenizer,
+    "xlnet-base-cased": XLNetTokenizer,
+    "xlm-mlm-enfr-1024": XLMTokenizer,
+    "distilbert-base-uncased": DistilBertTokenizer,
+    "roberta-base": RobertaTokenizer,
+}
+
+special_tokens_present = {
+    "bert-base-uncased": True,
+    "openai-gpt": False,
+    "gpt2": False,
+    "transfo-xl-wt103": False,
+    "xlnet-base-cased": True,
+    "xlm-mlm-enfr-1024": True,
+    "distilbert-base-uncased": True,
+    "roberta-base": True,
+}
+
+
+class PreTrainedLMFeaturizer(Featurizer):
+
+    provides = [
+        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
+        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # model key identified by HF Transformers
+        "model_key": "bert-base-uncased"
+    }
+
+    def _load_transformers_params(self):
+
+        self.model_key = self.component_config["model_key"]
+
+        if self.model_key not in tokenizer_dictionary:
+            logger.error("{} not a valid model key name".format(self.model_key))
+            raise
+
+        logger.info("Loading Tokenizer and Model for {}".format(self.model_key))
+        self.tokenizer = tokenizer_dictionary[self.model_key].from_pretrained(
+            self.model_key
+        )
+        self.model = model_dictionary[self.model_key].from_pretrained(self.model_key)
+        self.contains_special_token = special_tokens_present[self.model_key]
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+
+        super(PreTrainedLMFeaturizer, self).__init__(component_config)
+
+        self._load_transformers_params()
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig],
+        **kwargs: Any,
+    ) -> None:
+
+        for example in training_data.intent_examples:
+            for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
+                self._set_lm_features(example, attribute)
+
+    def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
+
+        message_attribute_text = example.get(attribute)
+        if message_attribute_text:
+            # Encode text
+            input_ids = torch.tensor(
+                [
+                    self.tokenizer.encode(
+                        message_attribute_text,
+                        add_special_tokens=self.contains_special_token,
+                    )
+                ]
+            )  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
+            with torch.no_grad():
+                last_hidden_states = self.model(input_ids)[
+                    0
+                ].numpy()  # Models outputs are now numpy array
+                sequence_embedding = last_hidden_states[0]  # First element of batch
+
+                if self.contains_special_token:
+                    # dim - (seq + 2, hdim)
+                    # Discard SEP token and move CLS token to last index
+                    sequence_embedding = sequence_embedding[:-1]  # Discard SEP
+                    sequence_embedding = np.roll(
+                        sequence_embedding, -1
+                    )  # Move CLS to back
+                else:
+                    sequence_embedding = np.concatenate(
+                        [
+                            sequence_embedding,
+                            np.zeros((1, sequence_embedding.shape[-1])),
+                        ],
+                        axis=0,
+                    )
+
+                features = self._combine_with_existing_dense_features(
+                    example,
+                    sequence_embedding,
+                    MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
+                )
+                example.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features)
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        self._set_lm_features(message)
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 6310584c7921..67dc3ee65a3a 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -25,6 +25,9 @@
 from rasa.nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
+from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
+    PreTrainedLMFeaturizer,
+)
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
@@ -64,6 +67,7 @@
     MitieFeaturizer,
     RegexFeaturizer,
     CountVectorsFeaturizer,
+    PreTrainedLMFeaturizer,
     # classifiers
     SklearnIntentClassifier,
     MitieIntentClassifier,
diff --git a/requirements.txt b/requirements.txt
index aa060659c4ce..32a57c3da980 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -57,3 +57,6 @@ PyJWT==1.7.1
 # remove when tensorflow@1.15.x or a pre-release patch is released
 # https://github.com/tensorflow/tensorflow/issues/32319
 gast==0.2.2
+torch
+torchvision
+transformers

From cc87ae2e9382508fa02e80744813c68d316c84c1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 13 Nov 2019 08:21:38 +0100
Subject: [PATCH 003/633] split up train graph into intent and entity

---
 .../embedding_intent_classifier.py            | 167 +++++++++++++-----
 1 file changed, 125 insertions(+), 42 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index bf20fb8dd2ac..b77f62395d16 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -136,6 +136,8 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
         "named_entity_recognition": True,
+        # number of entity tags
+        "num_tags": 0,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -168,8 +170,6 @@ def __init__(
         self.inverted_tag_dict = inverted_tag_dict
         # encode all label_ids with numbers
         self._label_data = None
-        # encode all tag_ids with numbers
-        self._tag_data = None
 
         # tf related instances
         self.session = session
@@ -284,6 +284,7 @@ def _load_params(self) -> None:
         self.named_entity_recognition = self.component_config[
             "named_entity_recognition"
         ]
+        self.num_tags = self.component_config["num_tags"]
 
     # package safety checks
     @classmethod
@@ -457,34 +458,6 @@ def use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]:
             )
         ]
 
-    def _create_encoded_tag_ids(
-        self,
-        training_data: "TrainingData",
-        tag_id_dict: Dict[Text, int],
-        attribute: Text,
-    ) -> np.ndarray:
-        """Create matrix with tag_ids encoded in rows as bag of words. If the features
-        are already computed, fetch them from the message object else compute a one
-        hot encoding for the label as the feature vector.
-        Find a training example for each tag and get the encoded features from the
-        corresponding Message object."""
-
-        tags_example = []
-
-        # Collect one example for each label
-        for tag_name, idx in tag_id_dict.items():
-            tag_example = self._find_example_for_tag(
-                tag_name, training_data.entity_examples, attribute
-            )
-            tags_example.append((idx, tag_example))
-
-        # Collect features, precomputed if they exist, else compute on the fly
-        features = self._compute_default_label_features(tags_example)
-        encoded_id_tags = [scipy.sparse.csr_matrix(f) for f in features]
-        encoded_id_tags = np.array(encoded_id_tags)
-
-        return encoded_id_tags
-
     # noinspection PyPep8Naming
     def _create_session_data(
         self,
@@ -528,11 +501,6 @@ def _create_session_data(
             if e.get(attribute):
                 label_ids.append(label_id_dict[e.get(attribute)])
 
-            mask = np.zeros(max_seq_len)
-            mask[0 : len(e.get("tokens"))] = 1
-            masks.append(mask)
-
-        for e in training_data:
             _tags = []
             for t in e.get("tokens"):
                 _tag = determine_token_labels(
@@ -541,6 +509,12 @@ def _create_session_data(
                 _tags.append(tag_id_dict[_tag])
             tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
 
+            mask = np.zeros(max_seq_len)
+            mask[0 : len(e.get("tokens"))] = 1
+            masks.append(mask)
+
+        X_sparse = np.array(X_sparse)
+        X_dense = np.array(X_dense)
         Y_sparse = np.array(Y_sparse)
         Y_dense = np.array(Y_dense)
         label_ids = np.array(label_ids)
@@ -552,6 +526,7 @@ def _create_session_data(
         self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense])
         self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense])
         self._add_to_session_data(session_data, "intent_ids", [label_ids])
+        self._add_to_session_data(session_data, "tag_ids", [tag_ids])
 
         if "intent_features" not in session_data:
             # no intent features are present, get default features from _label_data
@@ -617,16 +592,47 @@ def _build_tf_train_graph(
         b = self.combine_sparse_dense_features(
             batch_data["intent_features"], session_data["intent_features"], "intent"
         )
+        c = self.combine_sparse_dense_features(
+            batch_data["tag_ids"], session_data["tag_ids"], "tag"
+        )
         all_bs = self.combine_sparse_dense_features(
             label_data["intent_features"], self._label_data["intent_features"], "intent"
         )
+        mask = tf.Tensor()  # TODO
 
-        message_embed = self._create_tf_embed_fnn(
-            a,
-            self.hidden_layer_sizes["text"],
-            fnn_name="text_intent" if self.share_hidden_layers else "text",
-            embed_name="text",
+        # transformer
+        a = self._create_tf_sequence(a, mask)
+
+        if self.intent_classification:
+            return self._train_intent_graph(a, b, all_bs, mask)
+
+        if self.named_entity_recognition:
+            return self._train_entity_graph(a, c, mask)
+
+    def _train_entity_graph(self, a, c, mask):
+        sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+        sequence_lengths.set_shape([mask.shape[0]])
+
+        c = tf.reduce_sum(tf.nn.relu(c), -1)
+
+        return self._calculate_crf_loss(a, sequence_lengths, c)
+
+    def _train_intent_graph(
+        self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        last = tf.expand_dims(last, -1)
+
+        # get _cls_ vector for intent classification
+        cls_embed = tf.reduce_sum(a * last, 1)
+        cls_embed = train_utils.create_tf_embed(
+            cls_embed,
+            self.embed_dim,
+            self.C2,
+            self.similarity_type,
+            layer_name_suffix="a",
         )
+
         self.label_embed = self._create_tf_embed_fnn(
             b,
             self.hidden_layer_sizes["intent"],
@@ -641,7 +647,7 @@ def _build_tf_train_graph(
         )
 
         return train_utils.calculate_loss_acc(
-            message_embed,
+            cls_embed,
             self.label_embed,
             b,
             self.all_labels_embed,
@@ -656,6 +662,76 @@ def _build_tf_train_graph(
             self.scale_loss,
         )
 
+    def _calculate_crf_loss(
+        self, inputs: tf.Tensor, sequence_lengths: tf.Tensor, tag_indices: tf.Tensor
+    ):
+        """
+        Args:
+            inputs: tensor (batch-size, max-sequence-length, dimension)
+            sequence_lengths: tensor (batch-size)
+            tag_indices: (batch-size, max-sequence-length)
+        """
+        # CRF
+        crf_params, logits, pred_ids = self._create_crf(inputs, sequence_lengths)
+
+        # Loss
+        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
+            logits, tag_indices, sequence_lengths, crf_params
+        )
+        loss = tf.reduce_mean(-log_likelihood)
+
+        pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
+
+        # Metrics
+        weights = tf.sequence_mask(sequence_lengths)
+        metric = f1(tag_indices, pred_ids, self.num_tags, pos_tag_indices, weights)
+
+        return loss, metric
+
+    def _create_crf(
+        self, input: tf.Tensor, sequence_lengths: tf.Tensor
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
+
+            logits = tf.layers.dense(input, self.num_tags, name="crf-logits")
+            crf_params = tf.get_variable(
+                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float32
+            )
+            pred_ids, _ = tf.contrib.crf.crf_decode(
+                logits, crf_params, sequence_lengths
+            )
+
+            return crf_params, logits, pred_ids
+
+    def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
+        """Create sequence level embedding and mask."""
+        a_in = train_utils.create_tf_fnn(
+            a_in,
+            self.hidden_layer_sizes["a"],
+            self.droprate,
+            self.C2,
+            self._is_training,
+            layer_name_suffix="a",
+        )
+
+        self.attention_weights = {}
+        hparams = train_utils.create_t2t_hparams(
+            self.num_transformer_layers,
+            self.transformer_size,
+            self.num_heads,
+            self.droprate,
+            self.pos_encoding,
+            self.max_seq_length,
+            self._is_training,
+            self.unidirectional_encoder,
+        )
+
+        a = train_utils.create_t2t_transformer_encoder(
+            a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
+        )
+
+        return a
+
     def combine_sparse_dense_features(
         self,
         features: List[Union[tf.Tensor, tf.SparseTensor]],
@@ -768,13 +844,20 @@ def preprocess_train_data(self, training_data: "TrainingData"):
             training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE
         )
 
+        tag_id_dict = self._create_tag_id_dict(
+            training_data, attribute=MESSAGE_TEXT_ATTRIBUTE
+        )
+        self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
+
         session_data = self._create_session_data(
-            training_data.intent_examples,
+            training_data.training_examples,
             label_id_dict,
             tag_id_dict,
             attribute=MESSAGE_INTENT_ATTRIBUTE,
         )
 
+        self.num_tags = len(self.inverted_tag_dict)
+
         self.check_input_dimension_consistency(session_data)
 
         return session_data

From d994195f55064de382ecb420e7350a309e2d4e02 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 13 Nov 2019 10:43:15 +0100
Subject: [PATCH 004/633] update train graph

---
 .../embedding_intent_classifier.py            | 48 ++++++++-----------
 1 file changed, 19 insertions(+), 29 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6484ee465905..95b09a1b02e8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -494,12 +494,12 @@ def _create_session_data(
                 label_ids.append(label_id_dict[e.get(attribute)])
 
             _tags = []
-            for t in e.get("tokens"):
+            for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
                 _tag = determine_token_labels(
                     t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
                 )
                 _tags.append(tag_id_dict[_tag])
-            tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
+            tag_ids.append(scipy.sparse.csr_matrix(np.array([_tags]).T))
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -584,19 +584,13 @@ def _build_tf_train_graph(
         batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data)
         label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data)
 
-        a = self.combine_sparse_dense_features(
-            batch_data["text_features"], batch_data["text_mask"][0], "text"
-        )
-        b = self.combine_sparse_dense_features(
-            batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
-        )
-        c = self.combine_sparse_dense_features(
-            batch_data["tag_ids"], session_data["tag_ids"], "tag"
-        )
+        a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
+        b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent")
+        c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
         all_bs = self.combine_sparse_dense_features(
-            label_data["intent_features"], label_data["intent_mask"][0], "intent"
+            label_data["intent_features"], "intent"
         )
-        mask = tf.Tensor()  # TODO
+        mask = batch_data["text_mask"][0]
 
         # transformer
         a = self._create_tf_sequence(a, mask)
@@ -731,10 +725,7 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         return a
 
     def combine_sparse_dense_features(
-        self,
-        features: List[Union[tf.Tensor, tf.SparseTensor]],
-        mask: tf.Tensor,
-        name: Text,
+        self, features: List[Union[tf.Tensor, tf.SparseTensor]], name: Text
     ) -> tf.Tensor:
 
         dense_features = []
@@ -756,11 +747,11 @@ def combine_sparse_dense_features(
 
         output = tf.concat(dense_features, axis=-1)
         # apply mean to convert sequence to sentence features
-        output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1)
+        # output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1)
+
         return output
 
     def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor":
-
         shapes, types = train_utils.get_shapes_types(session_data)
 
         batch_placeholder = []
@@ -773,39 +764,38 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor":
             self.batch_in, session_data
         )
 
-        a = self.combine_sparse_dense_features(
-            batch_data["text_features"], batch_data["text_mask"][0], "text"
-        )
-        b = self.combine_sparse_dense_features(
-            batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
-        )
+        a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
+        b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent")
+        c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
 
         self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
 
+        if self.intent_classification:
+            return self._pred_intent_graph(a, b)
+        if self.named_entity_recognition:
+            return self._pred_entity_graph(a, c)
+
+    def _pred_intent_graph(self, a, b):
         message_embed = self._create_tf_embed_fnn(
             a,
             self.hidden_layer_sizes["text"],
             fnn_name="text_intent" if self.share_hidden_layers else "text",
             embed_name="text",
         )
-
         self.sim_all = train_utils.tf_raw_sim(
             message_embed[:, tf.newaxis, :],
             self.all_labels_embed[tf.newaxis, :, :],
             None,
         )
-
         self.label_embed = self._create_tf_embed_fnn(
             b,
             self.hidden_layer_sizes["intent"],
             fnn_name="text_intent" if self.share_hidden_layers else "intent",
             embed_name="intent",
         )
-
         self.sim = train_utils.tf_raw_sim(
             message_embed[:, tf.newaxis, :], self.label_embed, None
         )
-
         return train_utils.confidence_from_sim(self.sim_all, self.similarity_type)
 
     @staticmethod

From 6dc6535a51f5836d4643c7fa324cb0eb6aef4183 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 13 Nov 2019 11:55:47 +0100
Subject: [PATCH 005/633] update training metrics

---
 rasa/core/policies/embedding_policy.py        |  3 +-
 .../embedding_intent_classifier.py            | 67 ++++++++++------
 rasa/utils/train_utils.py                     | 76 +++++++++----------
 3 files changed, 80 insertions(+), 66 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index a353185d1e7b..b7bdb05a71b6 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -487,8 +487,7 @@ def train(
                 train_init_op,
                 eval_init_op,
                 batch_size_in,
-                loss,
-                acc,
+                {"loss": [loss], "acc": [acc]},
                 self._train_op,
                 self.session,
                 self._is_training,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8e65d1eac698..6cb3315eae76 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -135,7 +135,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true intent classification is trained and intent predicted
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": True,
+        "named_entity_recognition": False,
         # number of entity tags
         "num_tags": 0,
     }
@@ -499,7 +499,9 @@ def _create_session_data(
                     t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
                 )
                 _tags.append(tag_id_dict[_tag])
-            tag_ids.append(scipy.sparse.csr_matrix(np.array([_tags]).T))
+            tag_ids.append(
+                scipy.sparse.csr_matrix(np.array([_tags]).T.astype(np.float64))
+            )
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -572,7 +574,7 @@ def _create_tf_embed_fnn(
 
     def _build_tf_train_graph(
         self, session_data: SessionData
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+    ) -> Dict[Text, List[tf.Tensor]]:
 
         # get in tensors from generator
         self.batch_in = self._iterator.get_next()
@@ -687,7 +689,7 @@ def _create_crf(
 
             logits = tf.layers.dense(input, self.num_tags, name="crf-logits")
             crf_params = tf.get_variable(
-                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float32
+                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float64
             )
             pred_ids, _ = tf.contrib.crf.crf_decode(
                 logits, crf_params, sequence_lengths
@@ -699,11 +701,11 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         """Create sequence level embedding and mask."""
         a_in = train_utils.create_tf_fnn(
             a_in,
-            self.hidden_layer_sizes["a"],
+            self.hidden_layer_sizes["text"],
             self.droprate,
             self.C2,
             self._is_training,
-            layer_name_suffix="a",
+            layer_name_suffix="text",
         )
 
         self.attention_weights = {}
@@ -775,17 +777,26 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor":
         if self.named_entity_recognition:
             return self._pred_entity_graph(a, c)
 
-    def _pred_intent_graph(self, a, b):
-        message_embed = self._create_tf_embed_fnn(
-            a,
-            self.hidden_layer_sizes["text"],
-            fnn_name="text_intent" if self.share_hidden_layers else "text",
-            embed_name="text",
+    def _pred_intent_graph(self, a, b, mask):
+        last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        last = tf.expand_dims(last, -1)
+
+        # get _cls_ embedding
+        cls_embed = tf.reduce_sum(a * last, 1)
+        cls_embed = train_utils.create_tf_embed(
+            cls_embed,
+            self.embed_dim,
+            self.C2,
+            self.similarity_type,
+            layer_name_suffix="a",
         )
+
+        # reduce dimensionality as input should not be sequence for intent
+        # classification
+        self.b_in = tf.reduce_sum(self.b_in, 1)
+
         self.sim_all = train_utils.tf_raw_sim(
-            message_embed[:, tf.newaxis, :],
-            self.all_labels_embed[tf.newaxis, :, :],
-            None,
+            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], None
         )
         self.label_embed = self._create_tf_embed_fnn(
             b,
@@ -794,9 +805,20 @@ def _pred_intent_graph(self, a, b):
             embed_name="intent",
         )
         self.sim = train_utils.tf_raw_sim(
-            message_embed[:, tf.newaxis, :], self.label_embed, None
+            cls_embed[:, tf.newaxis, :], self.label_embed, None
+        )
+
+        self.intent_prediction = train_utils.confidence_from_sim(
+            self.sim_all, self.similarity_type
         )
-        return train_utils.confidence_from_sim(self.sim_all, self.similarity_type)
+
+    def _pred_entity_graph(self, a, c, mask):
+        # get sequence lengths for NER
+        sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+
+        # predict tagsx
+        _, _, pred_ids = self._create_crf(a, sequence_lengths)
+        self.tag_prediction = tf.to_int64(pred_ids)
 
     @staticmethod
     def _get_num_of_features(session_data: "SessionData", key_prefix: Text) -> int:
@@ -835,7 +857,7 @@ def preprocess_train_data(self, training_data: "TrainingData"):
         )
 
         tag_id_dict = self._create_tag_id_dict(
-            training_data, attribute=MESSAGE_TEXT_ATTRIBUTE
+            training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE
         )
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
@@ -912,20 +934,21 @@ def train(
 
             self._is_training = tf.placeholder_with_default(False, shape=())
 
-            loss, acc = self._build_tf_train_graph(session_data)
+            metrics = self._build_tf_train_graph(session_data)
+
+            loss = sum(metrics["loss"])
 
             # define which optimizer to use
             self._train_op = tf.train.AdamOptimizer().minimize(loss)
 
             # train tensorflow graph
             self.session = tf.Session(config=self._tf_config)
-            # TODO proper loss, acc handling
+
             train_utils.train_tf_dataset(
                 train_init_op,
                 eval_init_op,
                 batch_size_in,
-                loss,
-                intent_acc,
+                metrics,
                 self._train_op,
                 self.session,
                 self._is_training,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 00e462fceedf..6673422e9279 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1104,8 +1104,7 @@ def linearly_increasing_batch_size(
 
 def output_validation_stat(
     eval_init_op: "tf.Operation",
-    loss: "tf.Tensor",
-    acc: "tf.Tensor",
+    metrics: Dict[Text, List["tf.Tensor"]],
     session: "tf.Session",
     is_training: "tf.Session",
     batch_size_in: "tf.Tensor",
@@ -1114,29 +1113,30 @@ def output_validation_stat(
     """Output training statistics"""
 
     session.run(eval_init_op, feed_dict={batch_size_in: ep_batch_size})
-    ep_val_loss = 0
-    ep_val_acc = 0
+    ep_val_metrics = {k: [0] * len(v) for k, v in metrics.items()}
     batches_per_epoch = 0
     while True:
         try:
-            batch_val_loss, batch_val_acc = session.run(
-                [loss, acc], feed_dict={is_training: False}
-            )
+            batch_val_metrics = session.run([metrics], feed_dict={is_training: False})
             batches_per_epoch += 1
-            ep_val_loss += batch_val_loss
-            ep_val_acc += batch_val_acc
+            for k, values in batch_val_metrics.items():
+                for i, v in enumerate(values):
+                    ep_val_metrics[k][i] += v
         except tf.errors.OutOfRangeError:
             break
 
-    return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch
+    for k, values in ep_val_metrics:
+        for i, v in enumerate(values):
+            ep_val_metrics[k][i] = v / batches_per_epoch
+
+    return ep_val_metrics
 
 
 def train_tf_dataset(
     train_init_op: "tf.Operation",
     eval_init_op: "tf.Operation",
     batch_size_in: "tf.Tensor",
-    loss: "tf.Tensor",
-    acc: "tf.Tensor",
+    metrics: Dict[Text, List[tf.Tensor]],
     train_op: "tf.Tensor",
     session: "tf.Session",
     is_training: "tf.Session",
@@ -1156,65 +1156,57 @@ def train_tf_dataset(
         )
     pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
 
-    train_loss = 0
-    train_acc = 0
-    val_loss = 0
-    val_acc = 0
+    train_metrics = {k: [0] * len(v) for k, v in metrics.items()}
+    val_metrics = {k: [0] * len(v) for k, v in metrics.items()}
     for ep in pbar:
 
         ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
 
         session.run(train_init_op, feed_dict={batch_size_in: ep_batch_size})
 
-        ep_train_loss = 0
-        ep_train_acc = 0
+        ep_train_metrics = {k: [0] * len(v) for k, v in metrics.items()}
         batches_per_epoch = 0
         while True:
             try:
-                _, batch_train_loss, batch_train_acc = session.run(
-                    [train_op, loss, acc], feed_dict={is_training: True}
+                _, batch_train_metrics = session.run(
+                    [train_op, metrics], feed_dict={is_training: True}
                 )
                 batches_per_epoch += 1
-                ep_train_loss += batch_train_loss
-                ep_train_acc += batch_train_acc
+                for k, values in batch_train_metrics.items():
+                    for i, v in enumerate(values):
+                        ep_train_metrics[k][i] += v
 
             except tf.errors.OutOfRangeError:
                 break
 
-        train_loss = ep_train_loss / batches_per_epoch
-        train_acc = ep_train_acc / batches_per_epoch
+        for k, values in ep_train_metrics:
+            for i, v in enumerate(values):
+                train_metrics[k][i] = v / batches_per_epoch
 
-        postfix_dict = {"loss": f"{train_loss:.3f}", "acc": f"{train_acc:.3f}"}
+        postfix_dict = {}
+        for k, values in train_metrics:
+            for i, v in enumerate(values):
+                postfix_dict[f"{k} {i}"] = f"{v:.3f}"
 
         if eval_init_op is not None:
             if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
-                val_loss, val_acc = output_validation_stat(
+                val_metrics = output_validation_stat(
                     eval_init_op,
-                    loss,
-                    acc,
+                    metrics,
                     session,
                     is_training,
                     batch_size_in,
                     ep_batch_size,
                 )
 
-            postfix_dict.update(
-                {"val_loss": f"{val_loss:.3f}", "val_acc": f"{val_acc:.3f}"}
-            )
+            postfix_dict = {}
+            for k, values in val_metrics:
+                for i, v in enumerate(values):
+                    postfix_dict[f"val {k} {i}"] = f"{v:.3f}"
 
         pbar.set_postfix(postfix_dict)
 
-    final_message = (
-        "Finished training embedding policy, "
-        "train loss={:.3f}, train accuracy={:.3f}"
-        "".format(train_loss, train_acc)
-    )
-    if eval_init_op is not None:
-        final_message += (
-            ", validation loss={:.3f}, validation accuracy={:.3f}"
-            "".format(val_loss, val_acc)
-        )
-    logger.info(final_message)
+    logger.info("Finished training.")
 
 
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:

From ffebffa1a71ddb0f1fd2b254b9b598ef5f11198b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 13 Nov 2019 13:58:21 +0100
Subject: [PATCH 006/633] cast tensors to correct type

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 12 +++++-------
 rasa/utils/train_utils.py                           |  9 +++++++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6cb3315eae76..3811745bc55a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -500,7 +500,7 @@ def _create_session_data(
                 )
                 _tags.append(tag_id_dict[_tag])
             tag_ids.append(
-                scipy.sparse.csr_matrix(np.array([_tags]).T.astype(np.float64))
+                scipy.sparse.csr_matrix(np.array([_tags]).T)  # TODO coo matrix
             )
 
         X_sparse = np.array(X_sparse)
@@ -605,7 +605,6 @@ def _build_tf_train_graph(
 
     def _train_entity_graph(self, a, c, mask):
         sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
-        sequence_lengths.set_shape([mask.shape[0]])
 
         c = tf.reduce_sum(tf.nn.relu(c), -1)
 
@@ -699,13 +698,11 @@ def _create_crf(
 
     def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         """Create sequence level embedding and mask."""
-        a_in = train_utils.create_tf_fnn(
+        a_in = self._create_tf_embed_fnn(
             a_in,
             self.hidden_layer_sizes["text"],
-            self.droprate,
-            self.C2,
-            self._is_training,
-            layer_name_suffix="text",
+            fnn_name="text_intent" if self.share_hidden_layers else "text",
+            embed_name="text",
         )
 
         self.attention_weights = {}
@@ -914,6 +911,7 @@ def train(
 
         self.graph = tf.Graph()
         with self.graph.as_default():
+            # tf.enable_eager_execution()
             # set random seed
             tf.set_random_seed(self.random_seed)
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 6673422e9279..075316bb7ac4 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -530,6 +530,7 @@ def create_tf_fnn(
             name=f"hidden_layer_{layer_name_suffix}_{i}",
             reuse=tf.AUTO_REUSE,
         )
+        droprate = tf.cast(droprate, x.dtype)
         x = tf.layers.dropout(x, rate=droprate, training=is_training)
     return x
 
@@ -648,7 +649,9 @@ def create_t2t_transformer_encoder(
 
         x *= tf.expand_dims(mask, -1)
 
-        x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
+        x = tf.nn.dropout(
+            x, tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)
+        )
 
         attn_bias_for_padding = None
         # Otherwise the encoder will just use encoder_self_attention_bias.
@@ -666,7 +669,9 @@ def create_t2t_transformer_encoder(
 
         x *= tf.expand_dims(mask, -1)
 
-        return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
+        return tf.nn.dropout(
+            tf.nn.relu(x), tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)
+        )
 
 
 def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor":

From e959f99932539b4c8b8ce162df725ae353c1599a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 13 Nov 2019 14:26:03 +0100
Subject: [PATCH 007/633] cast tensor for tranformer

---
 .../classifiers/embedding_intent_classifier.py    |  6 ++++++
 rasa/utils/train_utils.py                         | 15 ++++++++++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3811745bc55a..8cf716210bc6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -717,10 +717,16 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
             self.unidirectional_encoder,
         )
 
+        a_in = tf.cast(a_in, tf.float32)
+        mask = tf.cast(mask, tf.float32)
+
         a = train_utils.create_t2t_transformer_encoder(
             a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
         )
 
+        a = tf.cast(a, tf.float64)
+        mask = tf.cast(mask, tf.float64)
+
         return a
 
     def combine_sparse_dense_features(
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 075316bb7ac4..c63771be33e7 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -640,14 +640,20 @@ def create_t2t_transformer_encoder(
         if hparams.multiply_embedding_mode == "sqrt_depth":
             x *= hparams.hidden_size ** 0.5
 
-        x *= tf.expand_dims(mask, -1)
+        if len(mask.shape) == 2:
+            x *= tf.expand_dims(mask, -1)
+        else:
+            x *= mask
         (
             x,
             self_attention_bias,
             encoder_decoder_attention_bias,
         ) = transformer_prepare_encoder(x, None, hparams)
 
-        x *= tf.expand_dims(mask, -1)
+        if len(mask.shape) == 2:
+            x *= tf.expand_dims(mask, -1)
+        else:
+            x *= mask
 
         x = tf.nn.dropout(
             x, tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)
@@ -667,7 +673,10 @@ def create_t2t_transformer_encoder(
             attn_bias_for_padding=attn_bias_for_padding,
         )
 
-        x *= tf.expand_dims(mask, -1)
+        if len(mask.shape) == 2:
+            x *= tf.expand_dims(mask, -1)
+        else:
+            x *= mask
 
         return tf.nn.dropout(
             tf.nn.relu(x), tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)

From 3171e5b8bd65d1b078ece538b47f30c396644b89 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 10:38:34 +0100
Subject: [PATCH 008/633] clean up after merge

---
 .../embedding_intent_classifier.py            | 129 +++++++++---------
 rasa/utils/train_utils.py                     |   9 +-
 2 files changed, 63 insertions(+), 75 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index dd0a9237fef1..216eef118ffe 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -141,61 +141,9 @@ class EmbeddingIntentClassifier(EntityExtractor):
     }
     # end default properties (DOC MARKER - don't remove)
 
-    def __init__(
-        self,
-        component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        session: Optional["tf.Session"] = None,
-        graph: Optional["tf.Graph"] = None,
-        batch_placeholder: Optional["tf.Tensor"] = None,
-        similarity_all: Optional["tf.Tensor"] = None,
-        intent_prediction: Optional["tf.Tensor"] = None,
-        tag_prediction: Optional["tf.Tensor"] = None,
-        similarity: Optional["tf.Tensor"] = None,
-        label_embed: Optional["tf.Tensor"] = None,
-        all_labels_embed: Optional["tf.Tensor"] = None,
-        attention_weights: Optional["tf.Tensor"] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
-    ) -> None:
-        """Declare instant variables with default values"""
-
-        super().__init__(component_config)
-
-        self._load_params()
-
-        # transform numbers to labels
-        self.inverted_label_dict = inverted_label_dict
-        # transform numbers to tags
-        self.inverted_tag_dict = inverted_tag_dict
-        # encode all label_ids with numbers
-        self._label_data = None
-
-        # tf related instances
-        self.session = session
-        self.graph = graph
-
-        self.batch_in = batch_placeholder
-
-        self.sim_all = similarity_all
-        self.intent_prediction = intent_prediction
-        self.entity_prediction = tag_prediction
-        self.sim = similarity
-        self.attention_weights = attention_weights
-
-        # persisted embeddings
-        self.label_embed = label_embed
-        self.all_labels_embed = all_labels_embed
-
-        # internal tf instances
-        self._iterator = None
-        self._train_op = None
-        self._is_training = None
-
-        self.batch_tuple_sizes = batch_tuple_sizes
-
-    # config migration warning
-    def _check_old_config_variables(self, config: Dict[Text, Any]) -> None:
+    @staticmethod
+    def _check_old_config_variables(config: Dict[Text, Any]) -> None:
+        """Config migration warning"""
         removed_tokenization_params = [
             "intent_tokenization_flag",
             "intent_split_symbol",
@@ -272,7 +220,6 @@ def _load_visual_params(self, config: Dict[Text, Any]) -> None:
         self.evaluate_on_num_examples = config["evaluate_on_num_examples"]
 
     def _load_params(self) -> None:
-
         self._check_old_config_variables(self.component_config)
         self._tf_config = train_utils.load_tf_config(self.component_config)
         self._load_nn_architecture_params(self.component_config)
@@ -291,6 +238,60 @@ def _load_params(self) -> None:
     def required_packages(cls) -> List[Text]:
         return ["tensorflow"]
 
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        session: Optional["tf.Session"] = None,
+        graph: Optional["tf.Graph"] = None,
+        batch_placeholder: Optional["tf.Tensor"] = None,
+        similarity_all: Optional["tf.Tensor"] = None,
+        intent_prediction: Optional["tf.Tensor"] = None,
+        entity_prediction: Optional["tf.Tensor"] = None,
+        similarity: Optional["tf.Tensor"] = None,
+        message_embed: Optional["tf.Tensor"] = None,
+        label_embed: Optional["tf.Tensor"] = None,
+        all_labels_embed: Optional["tf.Tensor"] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+        attention_weights: Optional["tf.Tensor"] = None,
+    ) -> None:
+        """Declare instant variables with default values"""
+
+        super().__init__(component_config)
+
+        self._load_params()
+
+        # transform numbers to labels
+        self.inverted_label_dict = inverted_label_dict
+        self.inverted_tag_dict = inverted_tag_dict
+        # encode all label_ids with numbers
+        self._label_data = None
+
+        # tf related instances
+        self.session = session
+        self.graph = graph
+        self.batch_in = batch_placeholder
+        self.sim_all = similarity_all
+        self.intent_prediction = intent_prediction
+        self.entity_prediction = entity_prediction
+        self.sim = similarity
+
+        # persisted embeddings
+        self.message_embed = message_embed
+        self.label_embed = label_embed
+        self.all_labels_embed = all_labels_embed
+
+        # keep the input tuple sizes in self.batch_in
+        self.batch_tuple_sizes = batch_tuple_sizes
+
+        # internal tf instances
+        self._iterator = None
+        self._train_op = None
+        self._is_training = None
+
+        self.attention_weights = attention_weights
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -509,9 +510,7 @@ def _create_session_data(
                     t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
                 )
                 _tags.append(tag_id_dict[_tag])
-            tag_ids.append(
-                scipy.sparse.csr_matrix(np.array([_tags]).T)  # TODO coo matrix
-            )
+            tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -733,7 +732,7 @@ def _create_crf(
 
             logits = tf.layers.dense(input, self.num_tags, name="crf-logits")
             crf_params = tf.get_variable(
-                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float64
+                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float32
             )
             pred_ids, _ = tf.contrib.crf.crf_decode(
                 logits, crf_params, sequence_lengths
@@ -762,16 +761,10 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
             self.unidirectional_encoder,
         )
 
-        a_in = tf.cast(a_in, tf.float32)
-        mask = tf.cast(mask, tf.float32)
-
         a = train_utils.create_t2t_transformer_encoder(
             a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
         )
 
-        a = tf.cast(a, tf.float64)
-        mask = tf.cast(mask, tf.float64)
-
         return a
 
     def combine_sparse_dense_features(
@@ -1243,7 +1236,7 @@ def load(
 
                 sim_all = train_utils.load_tensor("similarity_all")
                 intent_prediction = train_utils.load_tensor("intent_prediction")
-                tag_prediction = train_utils.load_tensor("tag_prediction")
+                entity_prediction = train_utils.load_tensor("entity_prediction")
                 sim = train_utils.load_tensor("similarity")
 
                 message_embed = train_utils.load_tensor("message_embed")
@@ -1276,7 +1269,7 @@ def load(
                 batch_placeholder=batch_in,
                 similarity_all=sim_all,
                 intent_prediction=intent_prediction,
-                tag_prediction=tag_prediction,
+                entity_prediction=entity_prediction,
                 similarity=sim,
                 message_embed=message_embed,
                 label_embed=label_embed,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 7602a87b4dce..0dadb5b2cb11 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -585,7 +585,6 @@ def create_tf_fnn(
             name=f"hidden_layer_{layer_name_suffix}_{i}",
             reuse=tf.AUTO_REUSE,
         )
-        droprate = tf.cast(droprate, x.dtype)
         x = tf.layers.dropout(x, rate=droprate, training=is_training)
     return x
 
@@ -709,9 +708,7 @@ def create_t2t_transformer_encoder(
         else:
             x *= mask
 
-        x = tf.nn.dropout(
-            x, tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)
-        )
+        x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
 
         attn_bias_for_padding = None
         # Otherwise the encoder will just use encoder_self_attention_bias.
@@ -732,9 +729,7 @@ def create_t2t_transformer_encoder(
         else:
             x *= mask
 
-        return tf.nn.dropout(
-            tf.nn.relu(x), tf.cast(1.0 - hparams.layer_prepostprocess_dropout, x.dtype)
-        )
+        return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
 
 
 def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor":

From 70b4f28c68c4b130eb74d78bd95c48d1792e6fb6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 11:10:01 +0100
Subject: [PATCH 009/633] intent classification works

---
 .../embedding_intent_classifier.py            | 89 +++++++------------
 rasa/utils/train_utils.py                     | 12 +--
 2 files changed, 36 insertions(+), 65 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 216eef118ffe..24660f9580e2 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -504,13 +504,14 @@ def _create_session_data(
             if label_attribute and e.get(label_attribute):
                 label_ids.append(label_id_dict[e.get(label_attribute)])
 
-            _tags = []
-            for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
-                _tag = determine_token_labels(
-                    t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
-                )
-                _tags.append(tag_id_dict[_tag])
-            tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
+            if tag_id_dict:
+                _tags = []
+                for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
+                    _tag = determine_token_labels(
+                        t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
+                    )
+                    _tags.append(tag_id_dict[_tag])
+                tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -589,11 +590,8 @@ def _create_tf_embed_fnn(
         )
 
     def combine_sparse_dense_features(
-        self,
-        features: List[Union[tf.Tensor, tf.SparseTensor]],
-        mask: tf.Tensor,
-        name: Text,
-    ) -> tf.Tensor:
+        self, features: List[Union["tf.Tensor", "tf.SparseTensor"]], name: Text
+    ) -> "tf.Tensor":
 
         dense_features = []
 
@@ -612,14 +610,11 @@ def combine_sparse_dense_features(
             else:
                 dense_features.append(f)
 
-        output = tf.concat(dense_features, axis=-1)
-        # apply mean to convert sequence to sentence features
-        output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1)
-        return output
+        return tf.concat(dense_features, axis=-1)
 
     def _build_tf_train_graph(
         self, session_data: SessionDataType
-    ) -> Dict[Text, List[tf.Tensor]]:
+    ) -> Dict[Text, List["tf.Tensor"]]:
 
         # get in tensors from generator
         self.batch_in = self._iterator.get_next()
@@ -656,9 +651,8 @@ def _train_entity_graph(self, a, c, mask):
 
     def _train_intent_graph(
         self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+    ) -> Dict[Text, List["tf.Tensor"]]:
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        last = tf.expand_dims(last, -1)
 
         # get _cls_ vector for intent classification
         cls_embed = tf.reduce_sum(a * last, 1)
@@ -667,9 +661,12 @@ def _train_intent_graph(
             self.embed_dim,
             self.C2,
             self.similarity_type,
-            layer_name_suffix="a",
+            layer_name_suffix="cls",
         )
 
+        b = tf.reduce_sum(tf.nn.relu(b), 1)
+        all_bs = tf.reduce_sum(tf.nn.relu(all_bs), 1)
+
         self.label_embed = self._create_tf_embed_fnn(
             b,
             self.hidden_layer_sizes["intent"],
@@ -683,7 +680,7 @@ def _train_intent_graph(
             embed_name="intent",
         )
 
-        return train_utils.calculate_loss_acc(
+        loss, acc = train_utils.calculate_loss_acc(
             cls_embed,
             self.label_embed,
             b,
@@ -699,6 +696,8 @@ def _train_intent_graph(
             self.scale_loss,
         )
 
+        return {"loss": [loss], "acc": [acc]}
+
     def _calculate_crf_loss(
         self, inputs: tf.Tensor, sequence_lengths: tf.Tensor, tag_indices: tf.Tensor
     ):
@@ -767,33 +766,6 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
 
         return a
 
-    def combine_sparse_dense_features(
-        self, features: List[Union[tf.Tensor, tf.SparseTensor]], name: Text
-    ) -> tf.Tensor:
-
-        dense_features = []
-
-        dense_dim = self.dense_dim
-        # if dense features are present use the feature dimension of the dense features
-        for f in features:
-            if not isinstance(f, tf.SparseTensor):
-                dense_dim = f.shape[-1]
-                break
-
-        for f in features:
-            if isinstance(f, tf.SparseTensor):
-                dense_features.append(
-                    train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2)
-                )
-            else:
-                dense_features.append(f)
-
-        output = tf.concat(dense_features, axis=-1)
-        # apply mean to convert sequence to sentence features
-        # output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1)
-
-        return output
-
     def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
 
         shapes, types = train_utils.get_shapes_types(session_data)
@@ -811,17 +783,20 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
         a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
         b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent")
         c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
+        mask = batch_data["text_mask"][0]
 
         self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
 
+        # transformer
+        a = self._create_tf_sequence(a, mask)
+
         if self.intent_classification:
-            return self._pred_intent_graph(a, b)
+            return self._pred_intent_graph(a, b, mask)
         if self.named_entity_recognition:
-            return self._pred_entity_graph(a, c)
+            return self._pred_entity_graph(a, c, mask)
 
-    def _pred_intent_graph(self, a, b, mask):
+    def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        last = tf.expand_dims(last, -1)
 
         # get _cls_ embedding
         cls_embed = tf.reduce_sum(a * last, 1)
@@ -830,12 +805,10 @@ def _pred_intent_graph(self, a, b, mask):
             self.embed_dim,
             self.C2,
             self.similarity_type,
-            layer_name_suffix="text",
+            layer_name_suffix="cls",
         )
 
-        # reduce dimensionality as input should not be sequence for intent
-        # classification
-        self.b_in = tf.reduce_sum(self.b_in, 1)
+        self.b = tf.reduce_sum(b, 1)
 
         self.sim_all = train_utils.tf_raw_sim(
             cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], None
@@ -959,7 +932,6 @@ def train(
 
         self.graph = tf.Graph()
         with self.graph.as_default():
-            # tf.enable_eager_execution()
             # set random seed
             tf.set_random_seed(self.random_seed)
 
@@ -982,9 +954,8 @@ def train(
 
             metrics = self._build_tf_train_graph(session_data)
 
-            loss = sum(metrics["loss"])
-
             # define which optimizer to use
+            loss = tf.add_n(metrics["loss"])
             self._train_op = tf.train.AdamOptimizer().minimize(loss)
 
             # train tensorflow graph
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 0dadb5b2cb11..40e237e99884 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1120,7 +1120,7 @@ def output_validation_stat(
     is_training: "tf.Session",
     batch_size_in: "tf.Tensor",
     ep_batch_size: int,
-) -> Tuple[float, float]:
+) -> Dict[Text, List[float]]:
     """Output training statistics"""
 
     session.run(eval_init_op, feed_dict={batch_size_in: ep_batch_size})
@@ -1136,7 +1136,7 @@ def output_validation_stat(
         except tf.errors.OutOfRangeError:
             break
 
-    for k, values in ep_val_metrics:
+    for k, values in ep_val_metrics.items():
         for i, v in enumerate(values):
             ep_val_metrics[k][i] = v / batches_per_epoch
 
@@ -1190,14 +1190,14 @@ def train_tf_dataset(
             except tf.errors.OutOfRangeError:
                 break
 
-        for k, values in ep_train_metrics:
+        for k, values in ep_train_metrics.items():
             for i, v in enumerate(values):
                 train_metrics[k][i] = v / batches_per_epoch
 
         postfix_dict = {}
-        for k, values in train_metrics:
+        for k, values in train_metrics.items():
             for i, v in enumerate(values):
-                postfix_dict[f"{k} {i}"] = f"{v:.3f}"
+                postfix_dict[f"{k}_{i}"] = f"{v:.3f}"
 
         if eval_init_op is not None:
             if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
@@ -1213,7 +1213,7 @@ def train_tf_dataset(
             postfix_dict = {}
             for k, values in val_metrics:
                 for i, v in enumerate(values):
-                    postfix_dict[f"val {k} {i}"] = f"{v:.3f}"
+                    postfix_dict[f"val_{k}_{i}"] = f"{v:.3f}"
 
         pbar.set_postfix(postfix_dict)
 

From c32fe427ee88dac75f46d772256f52ad7856d0f9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 11:15:34 +0100
Subject: [PATCH 010/633] restructure file

---
 .../embedding_intent_classifier.py            | 413 +++++++++---------
 1 file changed, 207 insertions(+), 206 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 24660f9580e2..ed612915a049 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -382,6 +382,75 @@ def _extract_and_add_features(
 
         return sparse_features, dense_features
 
+    @staticmethod
+    def _compute_default_label_features(
+        labels_example: List["Message"],
+    ) -> List[np.ndarray]:
+        """Compute one-hot representation for the labels"""
+
+        return [
+            np.array(
+                [
+                    scipy.sparse.coo_matrix(
+                        ([1], ([0], [idx])), shape=(1, len(labels_example))
+                    )
+                    for idx in range(len(labels_example))
+                ]
+            )
+        ]
+
+    @staticmethod
+    def _add_to_session_data(
+        session_data: SessionDataType, key: Text, features: List[np.ndarray]
+    ):
+        if not features:
+            return
+
+        session_data[key] = []
+
+        for data in features:
+            if data.size > 0:
+                session_data[key].append(data)
+
+    @staticmethod
+    def _add_mask_to_session_data(
+        session_data: SessionDataType, key: Text, from_key: Text
+    ):
+
+        session_data[key] = []
+
+        for data in session_data[from_key]:
+            if data.size > 0:
+                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
+                session_data[key].append(mask)
+                break
+
+    @staticmethod
+    def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int:
+        num_features = 0
+        for data in session_data[key]:
+            if data.size > 0:
+                num_features += data[0].shape[-1]
+        return num_features
+
+    @staticmethod
+    def _check_enough_labels(session_data: "SessionDataType") -> bool:
+        return len(np.unique(session_data["intent_ids"])) >= 2
+
+    def check_input_dimension_consistency(self, session_data: "SessionDataType"):
+        if self.share_hidden_layers:
+            num_text_features = self._get_num_of_features(session_data, "text_features")
+            num_intent_features = self._get_num_of_features(
+                session_data, "intent_features"
+            )
+
+            if num_text_features != num_intent_features:
+                raise ValueError(
+                    "If embeddings are shared "
+                    "text features and label features "
+                    "must coincide. Check the output dimensions of previous components."
+                )
+
     def _extract_labels_precomputed_features(
         self, label_examples: List["Message"]
     ) -> List[np.ndarray]:
@@ -404,23 +473,6 @@ def _extract_labels_precomputed_features(
 
         return [sparse_features, dense_features]
 
-    @staticmethod
-    def _compute_default_label_features(
-        labels_example: List["Message"],
-    ) -> List[np.ndarray]:
-        """Compute one-hot representation for the labels"""
-
-        return [
-            np.array(
-                [
-                    scipy.sparse.coo_matrix(
-                        ([1], ([0], [idx])), shape=(1, len(labels_example))
-                    )
-                    for idx in range(len(labels_example))
-                ]
-            )
-        ]
-
     def _create_label_data(
         self,
         training_data: "TrainingData",
@@ -469,7 +521,6 @@ def use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]:
             )
         ]
 
-    # noinspection PyPep8Naming
     def _create_session_data(
         self,
         training_data: List["Message"],
@@ -537,32 +588,6 @@ def _create_session_data(
 
         return session_data
 
-    @staticmethod
-    def _add_to_session_data(
-        session_data: SessionDataType, key: Text, features: List[np.ndarray]
-    ):
-        if not features:
-            return
-
-        session_data[key] = []
-
-        for data in features:
-            if data.size > 0:
-                session_data[key].append(data)
-
-    @staticmethod
-    def _add_mask_to_session_data(
-        session_data: SessionDataType, key: Text, from_key: Text
-    ):
-
-        session_data[key] = []
-
-        for data in session_data[from_key]:
-            if data.size > 0:
-                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
-                session_data[key].append(mask)
-                break
-
     # tf helpers:
     def _create_tf_embed_fnn(
         self,
@@ -612,6 +637,34 @@ def combine_sparse_dense_features(
 
         return tf.concat(dense_features, axis=-1)
 
+    def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
+        """Create sequence level embedding and mask."""
+        a_in = self._create_tf_embed_fnn(
+            a_in,
+            self.hidden_layer_sizes["text"],
+            fnn_name="text_intent" if self.share_hidden_layers else "text",
+            embed_name="text",
+        )
+
+        self.attention_weights = {}
+        hparams = train_utils.create_t2t_hparams(
+            self.num_transformer_layers,
+            self.transformer_size,
+            self.num_heads,
+            self.droprate,
+            self.pos_encoding,
+            self.max_seq_length,
+            self._is_training,
+            self.unidirectional_encoder,
+        )
+
+        a = train_utils.create_t2t_transformer_encoder(
+            a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
+        )
+
+        return a
+
+    # build tf graphs:
     def _build_tf_train_graph(
         self, session_data: SessionDataType
     ) -> Dict[Text, List["tf.Tensor"]]:
@@ -739,33 +792,6 @@ def _create_crf(
 
             return crf_params, logits, pred_ids
 
-    def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
-        """Create sequence level embedding and mask."""
-        a_in = self._create_tf_embed_fnn(
-            a_in,
-            self.hidden_layer_sizes["text"],
-            fnn_name="text_intent" if self.share_hidden_layers else "text",
-            embed_name="text",
-        )
-
-        self.attention_weights = {}
-        hparams = train_utils.create_t2t_hparams(
-            self.num_transformer_layers,
-            self.transformer_size,
-            self.num_heads,
-            self.droprate,
-            self.pos_encoding,
-            self.max_seq_length,
-            self._is_training,
-            self.unidirectional_encoder,
-        )
-
-        a = train_utils.create_t2t_transformer_encoder(
-            a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
-        )
-
-        return a
-
     def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
 
         shapes, types = train_utils.get_shapes_types(session_data)
@@ -835,28 +861,7 @@ def _pred_entity_graph(self, a, c, mask):
         _, _, pred_ids = self._create_crf(a, sequence_lengths)
         self.tag_prediction = tf.to_int64(pred_ids)
 
-    @staticmethod
-    def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int:
-        num_features = 0
-        for data in session_data[key]:
-            if data.size > 0:
-                num_features += data[0].shape[-1]
-        return num_features
-
-    def check_input_dimension_consistency(self, session_data: "SessionDataType"):
-        if self.share_hidden_layers:
-            num_text_features = self._get_num_of_features(session_data, "text_features")
-            num_intent_features = self._get_num_of_features(
-                session_data, "intent_features"
-            )
-
-            if num_text_features != num_intent_features:
-                raise ValueError(
-                    "If embeddings are shared "
-                    "text features and label features "
-                    "must coincide. Check the output dimensions of previous components."
-                )
-
+    # train helpers
     def preprocess_train_data(self, training_data: "TrainingData"):
         """Prepares data for training.
 
@@ -890,120 +895,7 @@ def preprocess_train_data(self, training_data: "TrainingData"):
 
         return session_data
 
-    @staticmethod
-    def _check_enough_labels(session_data: "SessionDataType") -> bool:
-        return len(np.unique(session_data["intent_ids"])) >= 2
-
-    def train(
-        self,
-        training_data: "TrainingData",
-        cfg: Optional["RasaNLUModelConfig"] = None,
-        **kwargs: Any,
-    ) -> None:
-        """Train the embedding label classifier on a data set."""
-
-        logger.debug("Started training embedding classifier.")
-
-        # set numpy random seed
-        np.random.seed(self.random_seed)
-
-        session_data = self.preprocess_train_data(training_data)
-
-        if self.intent_classification:
-            possible_to_train = self._check_enough_labels(session_data)
-
-            if not possible_to_train:
-                logger.error(
-                    "Can not train a classifier. "
-                    "Need at least 2 different classes. "
-                    "Skipping training of classifier."
-                )
-                return
-
-        if self.evaluate_on_num_examples:
-            session_data, eval_session_data = train_utils.train_val_split(
-                session_data,
-                self.evaluate_on_num_examples,
-                self.random_seed,
-                label_key="intent_ids",
-            )
-        else:
-            eval_session_data = None
-
-        self.graph = tf.Graph()
-        with self.graph.as_default():
-            # set random seed
-            tf.set_random_seed(self.random_seed)
-
-            # allows increasing batch size
-            batch_size_in = tf.placeholder(tf.int64)
-
-            (
-                self._iterator,
-                train_init_op,
-                eval_init_op,
-            ) = train_utils.create_iterator_init_datasets(
-                session_data,
-                eval_session_data,
-                batch_size_in,
-                self.batch_in_strategy,
-                label_key="intent_ids",
-            )
-
-            self._is_training = tf.placeholder_with_default(False, shape=())
-
-            metrics = self._build_tf_train_graph(session_data)
-
-            # define which optimizer to use
-            loss = tf.add_n(metrics["loss"])
-            self._train_op = tf.train.AdamOptimizer().minimize(loss)
-
-            # train tensorflow graph
-            self.session = tf.Session(config=self._tf_config)
-
-            train_utils.train_tf_dataset(
-                train_init_op,
-                eval_init_op,
-                batch_size_in,
-                metrics,
-                self._train_op,
-                self.session,
-                self._is_training,
-                self.epochs,
-                self.batch_in_size,
-                self.evaluate_on_num_examples,
-                self.evaluate_every_num_epochs,
-            )
-
-            # rebuild the graph for prediction
-            self._build_tf_pred_graph(session_data)
-
-            self.attention_weights = train_utils.extract_attention(
-                self.attention_weights
-            )
-
     # process helpers
-    # noinspection PyPep8Naming
-    def _calculate_message_sim(
-        self, batch: Tuple[np.ndarray]
-    ) -> Tuple[np.ndarray, List[float]]:
-        """Calculate message similarities"""
-
-        message_sim = self.session.run(
-            self.intent_prediction,
-            feed_dict={
-                _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None
-            },
-        )
-
-        message_sim = message_sim.flatten()  # sim is a matrix
-
-        label_ids = message_sim.argsort()[::-1]
-        message_sim[::-1].sort()
-
-        # transform sim to python list for JSON serializing
-        return label_ids, message_sim.tolist()
-
     def predict_label(
         self, message: "Message"
     ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
@@ -1044,6 +936,26 @@ def predict_label(
 
         return label, label_ranking
 
+    def _calculate_message_sim(
+        self, batch: Tuple[np.ndarray]
+    ) -> Tuple[np.ndarray, List[float]]:
+        """Calculate message similarities"""
+
+        message_sim = self.session.run(
+            self.intent_prediction,
+            feed_dict={
+                _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None
+            },
+        )
+
+        message_sim = message_sim.flatten()  # sim is a matrix
+
+        label_ids = message_sim.argsort()[::-1]
+        message_sim[::-1].sort()
+
+        # transform sim to python list for JSON serializing
+        return label_ids, message_sim.tolist()
+
     def predict_entities(self, message: "Message") -> List[Dict]:
         if self.session is None:
             logger.error(
@@ -1103,6 +1015,95 @@ def _convert_tags_to_entities(
 
         return entities
 
+    # methods to overwrite
+    def train(
+        self,
+        training_data: "TrainingData",
+        cfg: Optional["RasaNLUModelConfig"] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Train the embedding label classifier on a data set."""
+
+        logger.debug("Started training embedding classifier.")
+
+        # set numpy random seed
+        np.random.seed(self.random_seed)
+
+        session_data = self.preprocess_train_data(training_data)
+
+        if self.intent_classification:
+            possible_to_train = self._check_enough_labels(session_data)
+
+            if not possible_to_train:
+                logger.error(
+                    "Can not train a classifier. "
+                    "Need at least 2 different classes. "
+                    "Skipping training of classifier."
+                )
+                return
+
+        if self.evaluate_on_num_examples:
+            session_data, eval_session_data = train_utils.train_val_split(
+                session_data,
+                self.evaluate_on_num_examples,
+                self.random_seed,
+                label_key="intent_ids",
+            )
+        else:
+            eval_session_data = None
+
+        self.graph = tf.Graph()
+        with self.graph.as_default():
+            # set random seed
+            tf.set_random_seed(self.random_seed)
+
+            # allows increasing batch size
+            batch_size_in = tf.placeholder(tf.int64)
+
+            (
+                self._iterator,
+                train_init_op,
+                eval_init_op,
+            ) = train_utils.create_iterator_init_datasets(
+                session_data,
+                eval_session_data,
+                batch_size_in,
+                self.batch_in_strategy,
+                label_key="intent_ids",
+            )
+
+            self._is_training = tf.placeholder_with_default(False, shape=())
+
+            metrics = self._build_tf_train_graph(session_data)
+
+            # define which optimizer to use
+            loss = tf.add_n(metrics["loss"])
+            self._train_op = tf.train.AdamOptimizer().minimize(loss)
+
+            # train tensorflow graph
+            self.session = tf.Session(config=self._tf_config)
+
+            train_utils.train_tf_dataset(
+                train_init_op,
+                eval_init_op,
+                batch_size_in,
+                metrics,
+                self._train_op,
+                self.session,
+                self._is_training,
+                self.epochs,
+                self.batch_in_size,
+                self.evaluate_on_num_examples,
+                self.evaluate_every_num_epochs,
+            )
+
+            # rebuild the graph for prediction
+            self._build_tf_pred_graph(session_data)
+
+            self.attention_weights = train_utils.extract_attention(
+                self.attention_weights
+            )
+
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
 

From d816a6a5eb5701037cefd2b60b93c0e3ffa46c52 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 11:56:45 +0100
Subject: [PATCH 011/633] run intent classification indepent from tags

---
 .../embedding_intent_classifier.py            | 34 +++++++++++--------
 .../selectors/embedding_response_selector.py  | 21 ++++++++++++
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index ed612915a049..2b2badd416e9 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -555,14 +555,14 @@ def _create_session_data(
             if label_attribute and e.get(label_attribute):
                 label_ids.append(label_id_dict[e.get(label_attribute)])
 
-            if tag_id_dict:
+            if self.named_entity_recognition and tag_id_dict:
                 _tags = []
                 for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
                     _tag = determine_token_labels(
                         t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
                     )
                     _tags.append(tag_id_dict[_tag])
-                tag_ids.append(scipy.sparse.coo_matrix(np.array([_tags]).T))
+                tag_ids.append(scipy.sparse.csr_matrix(np.array([_tags]).T))
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -576,6 +576,7 @@ def _create_session_data(
         self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense])
         self._add_to_session_data(session_data, "intent_ids", [label_ids])
         self._add_to_session_data(session_data, "tag_ids", [tag_ids])
+        self._add_mask_to_session_data(session_data, "text_mask", "text_features")
 
         if label_attribute and (
             "intent_features" not in session_data or not session_data["intent_features"]
@@ -583,9 +584,6 @@ def _create_session_data(
             # no label features are present, get default features from _label_data
             session_data["intent_features"] = self.use_default_label_features(label_ids)
 
-        self._add_mask_to_session_data(session_data, "text_mask", "text_features")
-        self._add_mask_to_session_data(session_data, "intent_mask", "intent_features")
-
         return session_data
 
     # tf helpers:
@@ -679,20 +677,24 @@ def _build_tf_train_graph(
         label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data)
 
         a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
-        b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent")
-        c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
-        all_bs = self.combine_sparse_dense_features(
-            label_data["intent_features"], "intent"
-        )
         mask = batch_data["text_mask"][0]
 
         # transformer
         a = self._create_tf_sequence(a, mask)
 
         if self.intent_classification:
+            b = self.combine_sparse_dense_features(
+                batch_data["intent_features"], "intent"
+            )
+            all_bs = self.combine_sparse_dense_features(
+                label_data["intent_features"], "intent"
+            )
+
             return self._train_intent_graph(a, b, all_bs, mask)
 
         if self.named_entity_recognition:
+            c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
+
             return self._train_entity_graph(a, c, mask)
 
     def _train_entity_graph(self, a, c, mask):
@@ -807,18 +809,22 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
         )
 
         a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
-        b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent")
-        c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
         mask = batch_data["text_mask"][0]
 
-        self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
-
         # transformer
         a = self._create_tf_sequence(a, mask)
 
         if self.intent_classification:
+            b = self.combine_sparse_dense_features(
+                batch_data["intent_features"], "intent"
+            )
+            self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
+
             return self._pred_intent_graph(a, b, mask)
+
         if self.named_entity_recognition:
+            c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
+
             return self._pred_entity_graph(a, c, mask)
 
     def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 9e1bf79d54d9..5643ecbedfcc 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -54,8 +54,21 @@ class ResponseSelector(EmbeddingIntentClassifier):
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
         "hidden_layers_sizes_b": [256, 128],
+        # sizes of hidden layers before the embedding layer for tag labels
+        # the number of hidden layers is thus equal to the length of this list
+        "hidden_layers_sizes_c": [],
         # Whether to share the hidden layer weights between input words and intent labels
         "share_hidden_layers": False,
+        # number of units in transformer
+        "transformer_size": 128,
+        # number of transformer layers
+        "num_transformer_layers": 1,
+        # number of attention heads in transformer
+        "num_heads": 4,
+        # type of positional encoding in transformer
+        "pos_encoding": "timing",  # string 'timing' or 'emb'
+        # max sequence length if pos_encoding='emb'
+        "max_seq_length": 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -95,6 +108,8 @@ class ResponseSelector(EmbeddingIntentClassifier):
         "C_emb": 0.8,
         # dropout rate for rnn
         "droprate": 0.2,
+        # use a unidirectional or bidirectional encoder
+        "unidirectional_encoder": True,
         # visualization of accuracy
         # how often to calculate training accuracy
         "evaluate_every_num_epochs": 20,  # small values may hurt performance
@@ -103,6 +118,12 @@ class ResponseSelector(EmbeddingIntentClassifier):
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
+        # if true intent classification is trained and intent predicted
+        "intent_classification": True,
+        # if true named entity recognition is trained and entities predicted
+        "named_entity_recognition": False,
+        # number of entity tags
+        "num_tags": 0,
     }
     # end default properties (DOC MARKER - don't remove)
 

From 96d526c3439666d13f115b107adbcd7d2cc13454 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 12:51:43 +0100
Subject: [PATCH 012/633] entity prediction #1

---
 .../embedding_intent_classifier.py            | 93 +++++++++++--------
 rasa/utils/train_utils.py                     |  1 +
 2 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 2b2badd416e9..f3f95b20da50 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,4 +1,5 @@
 import logging
+from collections import defaultdict
 
 import numpy as np
 import os
@@ -135,7 +136,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true intent classification is trained and intent predicted
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": False,
+        "named_entity_recognition": True,
         # number of entity tags
         "num_tags": 0,
     }
@@ -682,6 +683,8 @@ def _build_tf_train_graph(
         # transformer
         a = self._create_tf_sequence(a, mask)
 
+        train_output = defaultdict(list)
+
         if self.intent_classification:
             b = self.combine_sparse_dense_features(
                 batch_data["intent_features"], "intent"
@@ -690,19 +693,48 @@ def _build_tf_train_graph(
                 label_data["intent_features"], "intent"
             )
 
-            return self._train_intent_graph(a, b, all_bs, mask)
+            intent_output = self._train_intent_graph(a, b, all_bs, mask)
+            for k, v in intent_output.items():
+                train_output[k].append(v)
 
         if self.named_entity_recognition:
             c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
 
-            return self._train_entity_graph(a, c, mask)
+            entity_output = self._train_entity_graph(a, c, mask)
+            for k, v in entity_output.items():
+                train_output[k].append(v)
+
+        return train_output
 
-    def _train_entity_graph(self, a, c, mask):
+    def _train_entity_graph(self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"):
         sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+        if len(sequence_lengths.shape) > 1:
+            sequence_lengths = tf.squeeze(sequence_lengths)
+        sequence_lengths.set_shape([mask.shape[0]])
 
         c = tf.reduce_sum(tf.nn.relu(c), -1)
+        c = tf.cast(c, tf.int32)
+
+        # tensor shapes
+        # a: tensor(batch-size, max-seq-len, dim)
+        # sequence_lengths: tensor(batch-size)
+        # c: (batch-size, max-seq-len)
+
+        # CRF
+        crf_params, logits, pred_ids = self._create_crf(a, sequence_lengths)
+
+        # Loss
+        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
+            logits, c, sequence_lengths, crf_params
+        )
+        loss = tf.reduce_mean(-log_likelihood)
+
+        # calculate f1 score for train predictions
+        weights = tf.sequence_mask(sequence_lengths)
+        pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
+        metric = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
 
-        return self._calculate_crf_loss(a, sequence_lengths, c)
+        return {"loss": [loss], "f1": [metric]}
 
     def _train_intent_graph(
         self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"
@@ -753,32 +785,6 @@ def _train_intent_graph(
 
         return {"loss": [loss], "acc": [acc]}
 
-    def _calculate_crf_loss(
-        self, inputs: tf.Tensor, sequence_lengths: tf.Tensor, tag_indices: tf.Tensor
-    ):
-        """
-        Args:
-            inputs: tensor (batch-size, max-sequence-length, dimension)
-            sequence_lengths: tensor (batch-size)
-            tag_indices: (batch-size, max-sequence-length)
-        """
-        # CRF
-        crf_params, logits, pred_ids = self._create_crf(inputs, sequence_lengths)
-
-        # Loss
-        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
-            logits, tag_indices, sequence_lengths, crf_params
-        )
-        loss = tf.reduce_mean(-log_likelihood)
-
-        pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
-
-        # Metrics
-        weights = tf.sequence_mask(sequence_lengths)
-        metric = f1(tag_indices, pred_ids, self.num_tags, pos_tag_indices, weights)
-
-        return loss, metric
-
     def _create_crf(
         self, input: tf.Tensor, sequence_lengths: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
@@ -823,9 +829,7 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
             return self._pred_intent_graph(a, b, mask)
 
         if self.named_entity_recognition:
-            c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
-
-            return self._pred_entity_graph(a, c, mask)
+            return self._pred_entity_graph(a, mask)
 
     def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
@@ -859,9 +863,11 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
             self.sim_all, self.similarity_type
         )
 
-    def _pred_entity_graph(self, a, c, mask):
-        # get sequence lengths for NER
+    def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
         sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
+        if len(sequence_lengths.shape) > 1:
+            sequence_lengths = tf.squeeze(sequence_lengths)
+        sequence_lengths.set_shape([mask.shape[0]])
 
         # predict tagsx
         _, _, pred_ids = self._create_crf(a, sequence_lengths)
@@ -970,13 +976,20 @@ def predict_entities(self, message: "Message") -> List[Dict]:
                 "didn't receive enough training data"
             )
         else:
-            # get features (bag of words) for a message
-            # noinspection PyPep8Naming
-            X = self._extract_features(message)
+            # create session data from message and convert it into a batch of 1
+            session_data = self._create_session_data([message])
+            batch = train_utils.prepare_batch(
+                session_data, tuple_sizes=self.batch_tuple_sizes
+            )
 
             # load tf graph and session
             predictions = self.session.run(
-                self.entity_prediction, feed_dict={self.a_in: X}
+                self.entity_prediction,
+                feed_dict={
+                    _x_in: _x
+                    for _x_in, _x in zip(self.batch_in, batch)
+                    if _x is not None
+                },
             )
 
             tags = [self.inverted_tag_dict[p] for p in predictions[0]]
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 40e237e99884..aec63899331d 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1159,6 +1159,7 @@ def train_tf_dataset(
     """Train tf graph"""
 
     session.run(tf.global_variables_initializer())
+    session.run(tf.local_variables_initializer())
 
     if evaluate_on_num_examples:
         logger.info(

From 1e44bc3865b9301f4d4f23dcbef6e840af30c9e9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 13:06:13 +0100
Subject: [PATCH 013/633] fix return values of train

---
 .../embedding_intent_classifier.py            | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index a82b93336aa8..c8e74b0051d5 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -136,7 +136,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true intent classification is trained and intent predicted
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": True,
+        "named_entity_recognition": False,
         # number of entity tags
         "num_tags": 0,
     }
@@ -145,6 +145,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
     @staticmethod
     def _check_old_config_variables(config: Dict[Text, Any]) -> None:
         """Config migration warning"""
+
         removed_tokenization_params = [
             "intent_tokenization_flag",
             "intent_split_symbol",
@@ -699,20 +700,22 @@ def _build_tf_train_graph(
                 label_data["intent_features"], "intent"
             )
 
-            intent_output = self._train_intent_graph(a, b, all_bs, mask)
-            for k, v in intent_output.items():
-                train_output[k].append(v)
+            loss, acc = self._train_intent_graph(a, b, all_bs, mask)
+            train_output["loss"].append(loss)
+            train_output["acc"].append(acc)
 
         if self.named_entity_recognition:
             c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
 
-            entity_output = self._train_entity_graph(a, c, mask)
-            for k, v in entity_output.items():
-                train_output[k].append(v)
+            loss, acc = self._train_entity_graph(a, c, mask)
+            train_output["loss"].append(loss)
+            train_output["acc"].append(acc)
 
         return train_output
 
-    def _train_entity_graph(self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"):
+    def _train_entity_graph(
+        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
         if len(sequence_lengths.shape) > 1:
             sequence_lengths = tf.squeeze(sequence_lengths)
@@ -738,13 +741,13 @@ def _train_entity_graph(self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor")
         # calculate f1 score for train predictions
         weights = tf.sequence_mask(sequence_lengths)
         pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
-        metric = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
+        acc = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
 
-        return {"loss": [loss], "f1": [metric]}
+        return loss, acc[0]
 
     def _train_intent_graph(
         self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Dict[Text, List["tf.Tensor"]]:
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
 
         # get _cls_ vector for intent classification
@@ -773,7 +776,7 @@ def _train_intent_graph(
             embed_name="intent",
         )
 
-        loss, acc = train_utils.calculate_loss_acc(
+        return train_utils.calculate_loss_acc(
             cls_embed,
             self.label_embed,
             b,
@@ -789,8 +792,6 @@ def _train_intent_graph(
             self.scale_loss,
         )
 
-        return {"loss": [loss], "acc": [acc]}
-
     def _create_crf(
         self, input: tf.Tensor, sequence_lengths: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:

From f7733a718b35481d573382a21e8be7d228fb8e2a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 14:28:05 +0100
Subject: [PATCH 014/633] persist cls_embed tensor

---
 .../embedding_intent_classifier.py            | 47 ++++++++++---------
 rasa/utils/train_utils.py                     |  2 +-
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index c8e74b0051d5..7f2a10d371fe 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -136,9 +136,9 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true intent classification is trained and intent predicted
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": False,
+        "named_entity_recognition": True,
         # number of entity tags
-        "num_tags": 0,
+        "num_tags": None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -252,7 +252,7 @@ def __init__(
         intent_prediction: Optional["tf.Tensor"] = None,
         entity_prediction: Optional["tf.Tensor"] = None,
         similarity: Optional["tf.Tensor"] = None,
-        message_embed: Optional["tf.Tensor"] = None,
+        cls_embed: Optional["tf.Tensor"] = None,
         label_embed: Optional["tf.Tensor"] = None,
         all_labels_embed: Optional["tf.Tensor"] = None,
         batch_tuple_sizes: Optional[Dict] = None,
@@ -280,7 +280,7 @@ def __init__(
         self.sim = similarity
 
         # persisted embeddings
-        self.message_embed = message_embed
+        self.cls_embed = cls_embed
         self.label_embed = label_embed
         self.all_labels_embed = all_labels_embed
 
@@ -566,7 +566,8 @@ def _create_session_data(
                         t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
                     )
                     _tags.append(tag_id_dict[_tag])
-                tag_ids.append(scipy.sparse.csr_matrix(np.array([_tags]).T))
+                # transpose to have seq_len x 1
+                tag_ids.append(np.array([_tags]).T)
 
         X_sparse = np.array(X_sparse)
         X_dense = np.array(X_dense)
@@ -645,6 +646,7 @@ def combine_sparse_dense_features(
 
     def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         """Create sequence level embedding and mask."""
+
         a_in = self._create_tf_embed_fnn(
             a_in,
             self.hidden_layer_sizes["text"],
@@ -751,9 +753,9 @@ def _train_intent_graph(
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
 
         # get _cls_ vector for intent classification
-        cls_embed = tf.reduce_sum(a * last, 1)
-        cls_embed = train_utils.create_tf_embed(
-            cls_embed,
+        self.cls_embed = tf.reduce_sum(a * last, 1)
+        self.cls_embed = train_utils.create_tf_embed(
+            self.cls_embed,
             self.embed_dim,
             self.C2,
             self.similarity_type,
@@ -777,7 +779,7 @@ def _train_intent_graph(
         )
 
         return train_utils.calculate_loss_acc(
-            cls_embed,
+            self.cls_embed,
             self.label_embed,
             b,
             self.all_labels_embed,
@@ -796,7 +798,6 @@ def _create_crf(
         self, input: tf.Tensor, sequence_lengths: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
-
             logits = tf.layers.dense(input, self.num_tags, name="crf-logits")
             crf_params = tf.get_variable(
                 "crf-params", [self.num_tags, self.num_tags], dtype=tf.float32
@@ -833,28 +834,30 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
             )
             self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
 
-            return self._pred_intent_graph(a, b, mask)
+            self._pred_intent_graph(a, b, mask)
 
         if self.named_entity_recognition:
-            return self._pred_entity_graph(a, mask)
+            self._pred_entity_graph(a, mask)
 
     def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
 
         # get _cls_ embedding
-        cls_embed = tf.reduce_sum(a * last, 1)
-        cls_embed = train_utils.create_tf_embed(
-            cls_embed,
+        self.cls_embed = tf.reduce_sum(a * last, 1)
+        self.cls_embed = train_utils.create_tf_embed(
+            self.cls_embed,
             self.embed_dim,
             self.C2,
             self.similarity_type,
             layer_name_suffix="cls",
         )
 
-        self.b = tf.reduce_sum(b, 1)
+        b = tf.reduce_sum(b, 1)
 
         self.sim_all = train_utils.tf_raw_sim(
-            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], None
+            self.cls_embed[:, tf.newaxis, :],
+            self.all_labels_embed[tf.newaxis, :, :],
+            None,
         )
         self.label_embed = self._create_tf_embed_fnn(
             b,
@@ -863,7 +866,7 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
             embed_name="intent",
         )
         self.sim = train_utils.tf_raw_sim(
-            cls_embed[:, tf.newaxis, :], self.label_embed, None
+            self.cls_embed[:, tf.newaxis, :], self.label_embed, None
         )
 
         self.intent_prediction = train_utils.confidence_from_sim(
@@ -878,7 +881,7 @@ def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
 
         # predict tagsx
         _, _, pred_ids = self._create_crf(a, sequence_lengths)
-        self.tag_prediction = tf.to_int64(pred_ids)
+        self.entity_prediction = tf.to_int64(pred_ids)
 
     # train helpers
     def preprocess_train_data(self, training_data: "TrainingData"):
@@ -984,6 +987,7 @@ def predict_entities(self, message: "Message") -> List[Dict]:
             )
         else:
             # create session data from message and convert it into a batch of 1
+            self.num_tags = len(self.inverted_tag_dict)
             session_data = self._create_session_data([message])
             batch = train_utils.prepare_batch(
                 session_data, tuple_sizes=self.batch_tuple_sizes
@@ -1175,7 +1179,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             )
             train_utils.persist_tensor("similarity", self.sim, self.graph)
 
-            train_utils.persist_tensor("message_embed", self.message_embed, self.graph)
+            train_utils.persist_tensor("cls_embed", self.cls_embed, self.graph)
             train_utils.persist_tensor("label_embed", self.label_embed, self.graph)
             train_utils.persist_tensor(
                 "all_labels_embed", self.all_labels_embed, self.graph
@@ -1233,6 +1237,7 @@ def load(
                 batch_in = train_utils.load_tensor("batch_placeholder")
 
                 sim_all = train_utils.load_tensor("similarity_all")
+                cls_embed = train_utils.load_tensor("cls_embed")
                 intent_prediction = train_utils.load_tensor("intent_prediction")
                 entity_prediction = train_utils.load_tensor("entity_prediction")
                 sim = train_utils.load_tensor("similarity")
@@ -1269,7 +1274,7 @@ def load(
                 intent_prediction=intent_prediction,
                 entity_prediction=entity_prediction,
                 similarity=sim,
-                message_embed=message_embed,
+                cls_embed=cls_embed,
                 label_embed=label_embed,
                 all_labels_embed=all_labels_embed,
                 attention_weights=attention_weights,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 89628a8fe1bb..e087c012c9cc 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -364,8 +364,8 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
         coo = [x.tocoo() for x in array_of_sparse]
     else:
         coo = array_of_sparse
-    data = [v for x in array_of_sparse for v in x.data]
 
+    data = [v for x in array_of_sparse for v in x.data]
     indices = [
         ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.row, x.col)
     ]

From db97b1bafafa591d27d91909374f48f872912ad4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 14:41:26 +0100
Subject: [PATCH 015/633] add test

---
 tests/nlu/training/test_train.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 2bc2d24897a3..6e0b2e339755 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -155,6 +155,25 @@ async def test_train_model_no_events(language, pipeline, component_builder, tmpd
     assert loaded.parse("Hello today is Monday, again!") is not None
 
 
+@utilities.slowtest
+async def test_train_model_with_entities(component_builder, tmpdir):
+    _config = utilities.base_test_conf("supervised_embeddings")
+    _config.pipeline.remove({"name": "CRFEntityExtractor"})
+    (trained, _, persisted_path) = await train(
+        _config,
+        path=tmpdir.strpath,
+        data="./data/test/multiple_files_markdown",
+        component_builder=component_builder,
+    )
+    assert trained.pipeline
+    loaded = Interpreter.load(persisted_path, component_builder)
+    assert loaded.pipeline
+    result = loaded.parse("list Italian restaurants")
+    assert result is not None
+    assert result["intent"]["name"] == "restaurant_search"
+    assert len(result["entities"]) == 1
+
+
 async def test_train_model_empty_pipeline(component_builder):
     # Should return an empty pipeline
     _config = utilities.base_test_conf(pipeline_template=None)

From 212656d6995c31ac4681cbf91431ba783d301a4f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 14:44:02 +0100
Subject: [PATCH 016/633] fix entity score

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7f2a10d371fe..7b1e2480f783 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -743,9 +743,9 @@ def _train_entity_graph(
         # calculate f1 score for train predictions
         weights = tf.sequence_mask(sequence_lengths)
         pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
-        acc = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
+        score = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
 
-        return loss, acc[0]
+        return loss, score[1]
 
     def _train_intent_graph(
         self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"

From f6a2f0a5a76b1d437fb3b74fe307bda80ed02911 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 15:29:59 +0100
Subject: [PATCH 017/633] fix sequence length

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 9 ++-------
 tests/nlu/training/test_train.py                    | 2 ++
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7b1e2480f783..fa8561b27084 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -718,9 +718,7 @@ def _build_tf_train_graph(
     def _train_entity_graph(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
-        if len(sequence_lengths.shape) > 1:
-            sequence_lengths = tf.squeeze(sequence_lengths)
+        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
         sequence_lengths.set_shape([mask.shape[0]])
 
         c = tf.reduce_sum(tf.nn.relu(c), -1)
@@ -874,10 +872,7 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         )
 
     def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
-        sequence_lengths = tf.cast(tf.reduce_sum(mask, 1), tf.int32)
-        if len(sequence_lengths.shape) > 1:
-            sequence_lengths = tf.squeeze(sequence_lengths)
-        sequence_lengths.set_shape([mask.shape[0]])
+        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
 
         # predict tagsx
         _, _, pred_ids = self._create_crf(a, sequence_lengths)
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 6e0b2e339755..a30a32892303 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -172,6 +172,8 @@ async def test_train_model_with_entities(component_builder, tmpdir):
     assert result is not None
     assert result["intent"]["name"] == "restaurant_search"
     assert len(result["entities"]) == 1
+    assert result["entities"][0]["value"] == "Italian"
+    assert result["entities"][0]["entity"] == "cuisine"
 
 
 async def test_train_model_empty_pipeline(component_builder):

From 9e2ba3f5ccbe7c9e53486bb008b48cc2259212cc Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 15:38:27 +0100
Subject: [PATCH 018/633] refactoring

---
 .../embedding_intent_classifier.py            | 45 +++++++++----------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fa8561b27084..33ff0db17a3f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -980,34 +980,33 @@ def predict_entities(self, message: "Message") -> List[Dict]:
                 "component is either not trained or "
                 "didn't receive enough training data"
             )
-        else:
-            # create session data from message and convert it into a batch of 1
-            self.num_tags = len(self.inverted_tag_dict)
-            session_data = self._create_session_data([message])
-            batch = train_utils.prepare_batch(
-                session_data, tuple_sizes=self.batch_tuple_sizes
-            )
+            return []
 
-            # load tf graph and session
-            predictions = self.session.run(
-                self.entity_prediction,
-                feed_dict={
-                    _x_in: _x
-                    for _x_in, _x in zip(self.batch_in, batch)
-                    if _x is not None
-                },
-            )
+        # create session data from message and convert it into a batch of 1
+        self.num_tags = len(self.inverted_tag_dict)
+        session_data = self._create_session_data([message])
+        batch = train_utils.prepare_batch(
+            session_data, tuple_sizes=self.batch_tuple_sizes
+        )
 
-            tags = [self.inverted_tag_dict[p] for p in predictions[0]]
+        # load tf graph and session
+        predictions = self.session.run(
+            self.entity_prediction,
+            feed_dict={
+                _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None
+            },
+        )
 
-            entities = self._convert_tags_to_entities(
-                message.text, message.get("tokens", []), tags
-            )
+        tags = [self.inverted_tag_dict[p] for p in predictions[0]]
+
+        entities = self._convert_tags_to_entities(
+            message.text, message.get("tokens", []), tags
+        )
 
-            extracted = self.add_extractor_name(entities)
-            entities = message.get("entities", []) + extracted
+        extracted = self.add_extractor_name(entities)
+        entities = message.get("entities", []) + extracted
 
-            return entities
+        return entities
 
     def _convert_tags_to_entities(
         self, text: str, tokens: List[Token], tags: List[Text]

From 9e36be7f4bd9b61ddd6457e4d76727ffcebf89c5 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 16:20:44 +0100
Subject: [PATCH 019/633] refactor train metrics

---
 rasa/core/policies/embedding_policy.py        |  4 +-
 .../embedding_intent_classifier.py            | 22 ++---
 rasa/utils/train_utils.py                     | 93 +++++++++++++------
 3 files changed, 77 insertions(+), 42 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 78a1d75c53f3..91a9eda7f919 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -24,6 +24,8 @@
 import tensorflow as tf
 
 # avoid warning println on contrib import - remove for tf 2
+from utils.train_utils import TrainingMetrics
+
 tf.contrib._warning = None
 logger = logging.getLogger(__name__)
 
@@ -483,7 +485,7 @@ def train(
                 train_init_op,
                 eval_init_op,
                 batch_size_in,
-                {"loss": [loss], "acc": [acc]},
+                TrainingMetrics(loss={"loss": loss}, score={"acc": acc}),
                 self._train_op,
                 self.session,
                 self._is_training,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 33ff0db17a3f..dc3ca3480190 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -16,7 +16,7 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.utils import train_utils
-from rasa.utils.train_utils import SessionDataType
+from rasa.utils.train_utils import SessionDataType, TrainingMetrics
 from rasa.nlu.constants import (
     MESSAGE_INTENT_ATTRIBUTE,
     MESSAGE_TEXT_ATTRIBUTE,
@@ -673,9 +673,7 @@ def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         return a
 
     # build tf graphs:
-    def _build_tf_train_graph(
-        self, session_data: SessionDataType
-    ) -> Dict[Text, List["tf.Tensor"]]:
+    def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetrics:
 
         # get in tensors from generator
         self.batch_in = self._iterator.get_next()
@@ -692,7 +690,7 @@ def _build_tf_train_graph(
         # transformer
         a = self._create_tf_sequence(a, mask)
 
-        train_output = defaultdict(list)
+        metrics = TrainingMetrics(loss={}, score={})
 
         if self.intent_classification:
             b = self.combine_sparse_dense_features(
@@ -703,17 +701,17 @@ def _build_tf_train_graph(
             )
 
             loss, acc = self._train_intent_graph(a, b, all_bs, mask)
-            train_output["loss"].append(loss)
-            train_output["acc"].append(acc)
+            metrics.loss["intent_loss"] = loss
+            metrics.score["intent_acc"] = acc
 
         if self.named_entity_recognition:
             c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
 
-            loss, acc = self._train_entity_graph(a, c, mask)
-            train_output["loss"].append(loss)
-            train_output["acc"].append(acc)
+            loss, f1_score = self._train_entity_graph(a, c, mask)
+            metrics.loss["entity_loss"] = loss
+            metrics.score["entity_f1_score"] = f1_score
 
-        return train_output
+        return metrics
 
     def _train_entity_graph(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
@@ -1101,7 +1099,7 @@ def train(
             metrics = self._build_tf_train_graph(session_data)
 
             # define which optimizer to use
-            loss = tf.add_n(metrics["loss"])
+            loss = tf.add_n(list(metrics.loss.values()))
             self._train_op = tf.train.AdamOptimizer().minimize(loss)
 
             # train tensorflow graph
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 772e418d68de..4b0d36ee8853 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -2,7 +2,18 @@
 import logging
 import scipy.sparse
 import typing
-from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any
+from typing import (
+    List,
+    Optional,
+    Text,
+    Dict,
+    Tuple,
+    Union,
+    Generator,
+    Callable,
+    Any,
+    NamedTuple,
+)
 import numpy as np
 from tqdm import tqdm
 from sklearn.model_selection import train_test_split
@@ -29,6 +40,12 @@
 SessionDataType = Dict[Text, List[np.ndarray]]
 
 
+# namedtuple for training metrics
+class TrainingMetrics(NamedTuple):
+    loss: Dict[Text, tf.Tensor]
+    score: Dict[Text, tf.Tensor]
+
+
 def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]:
     """Prepare `tf.compat.v1.ConfigProto` for training"""
 
@@ -1138,30 +1155,35 @@ def linearly_increasing_batch_size(
 
 def output_validation_stat(
     eval_init_op: "tf.Operation",
-    metrics: Dict[Text, List["tf.Tensor"]],
+    metrics: TrainingMetrics,
     session: "tf.Session",
     is_training: "tf.Session",
     batch_size_in: "tf.Tensor",
     ep_batch_size: int,
-) -> Dict[Text, List[float]]:
+) -> TrainingMetrics:
     """Output training statistics"""
 
     session.run(eval_init_op, feed_dict={batch_size_in: ep_batch_size})
-    ep_val_metrics = {k: [0] * len(v) for k, v in metrics.items()}
+    ep_val_metrics = TrainingMetrics(
+        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
+    )
     batches_per_epoch = 0
     while True:
         try:
             batch_val_metrics = session.run([metrics], feed_dict={is_training: False})
             batches_per_epoch += 1
-            for k, values in batch_val_metrics.items():
-                for i, v in enumerate(values):
-                    ep_val_metrics[k][i] += v
+            for name, value in batch_val_metrics.loss.items():
+                ep_val_metrics.loss[name] += value
+            for name, value in batch_val_metrics.score.items():
+                ep_val_metrics.score[name] += value
+
         except tf.errors.OutOfRangeError:
             break
 
-    for k, values in ep_val_metrics.items():
-        for i, v in enumerate(values):
-            ep_val_metrics[k][i] = v / batches_per_epoch
+    for name, value in ep_val_metrics.loss.items():
+        ep_val_metrics.loss[name] = value / batches_per_epoch
+    for name, value in ep_val_metrics.score.items():
+        ep_val_metrics.score[name] = value / batches_per_epoch
 
     return ep_val_metrics
 
@@ -1170,7 +1192,7 @@ def train_tf_dataset(
     train_init_op: "tf.Operation",
     eval_init_op: "tf.Operation",
     batch_size_in: "tf.Tensor",
-    metrics: Dict[Text, List[tf.Tensor]],
+    metrics: TrainingMetrics,
     train_op: "tf.Tensor",
     session: "tf.Session",
     is_training: "tf.Session",
@@ -1191,37 +1213,42 @@ def train_tf_dataset(
         )
     pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
 
-    train_metrics = {k: [0] * len(v) for k, v in metrics.items()}
-    val_metrics = {k: [0] * len(v) for k, v in metrics.items()}
+    train_metrics = TrainingMetrics(
+        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
+    )
+    val_metrics = TrainingMetrics(
+        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
+    )
     for ep in pbar:
 
         ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
 
         session.run(train_init_op, feed_dict={batch_size_in: ep_batch_size})
 
-        ep_train_metrics = {k: [0] * len(v) for k, v in metrics.items()}
+        ep_train_metrics = TrainingMetrics(
+            loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
+        )
         batches_per_epoch = 0
         while True:
             try:
-                _, batch_train_metrics = session.run(
+                _, batch_train_metric = session.run(
                     [train_op, metrics], feed_dict={is_training: True}
                 )
                 batches_per_epoch += 1
-                for k, values in batch_train_metrics.items():
-                    for i, v in enumerate(values):
-                        ep_train_metrics[k][i] += v
+                for name, value in batch_train_metric.loss.items():
+                    ep_train_metrics.loss[name] += value
+                for name, value in batch_train_metric.score.items():
+                    ep_train_metrics.score[name] += value
 
             except tf.errors.OutOfRangeError:
                 break
 
-        for k, values in ep_train_metrics.items():
-            for i, v in enumerate(values):
-                train_metrics[k][i] = v / batches_per_epoch
+        for name, value in ep_train_metrics.loss.items():
+            train_metrics.loss[name] = value / batches_per_epoch
+        for name, value in ep_train_metrics.score.items():
+            train_metrics.score[name] = value / batches_per_epoch
 
-        postfix_dict = {}
-        for k, values in train_metrics.items():
-            for i, v in enumerate(values):
-                postfix_dict[f"{k}_{i}"] = f"{v:.3f}"
+        postfix_dict = _create_postfix_dict(train_metrics)
 
         if eval_init_op is not None:
             if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
@@ -1234,16 +1261,24 @@ def train_tf_dataset(
                     ep_batch_size,
                 )
 
-            postfix_dict = {}
-            for k, values in val_metrics:
-                for i, v in enumerate(values):
-                    postfix_dict[f"val_{k}_{i}"] = f"{v:.3f}"
+            postfix_dict = _create_postfix_dict(val_metrics, "val_")
 
         pbar.set_postfix(postfix_dict)
 
     logger.info("Finished training.")
 
 
+def _create_postfix_dict(
+    metrics: TrainingMetrics, prefix: Text = ""
+) -> Dict[Text, Text]:
+    postfix_dict = {}
+    for name, value in metrics.loss.items():
+        postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
+    for name, value in metrics.score.items():
+        postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
+    return postfix_dict
+
+
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 

From 696e7437c3fa9c8f10a35c86cdcf9eb8fbeb06f1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 14 Nov 2019 16:54:48 +0100
Subject: [PATCH 020/633] fix training output

---
 .../embedding_intent_classifier.py            |  8 ++++----
 rasa/utils/train_utils.py                     | 19 ++++++++-----------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index dc3ca3480190..e298ca22c797 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -701,15 +701,15 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
             )
 
             loss, acc = self._train_intent_graph(a, b, all_bs, mask)
-            metrics.loss["intent_loss"] = loss
-            metrics.score["intent_acc"] = acc
+            metrics.loss["i_loss"] = loss
+            metrics.score["i_acc"] = acc
 
         if self.named_entity_recognition:
             c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
 
             loss, f1_score = self._train_entity_graph(a, c, mask)
-            metrics.loss["entity_loss"] = loss
-            metrics.score["entity_f1_score"] = f1_score
+            metrics.loss["e_loss"] = loss
+            metrics.score["e_f1"] = f1_score
 
         return metrics
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 4b0d36ee8853..bc06062fbc55 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1171,6 +1171,7 @@ def output_validation_stat(
     while True:
         try:
             batch_val_metrics = session.run([metrics], feed_dict={is_training: False})
+            batch_val_metrics = batch_val_metrics[0]
             batches_per_epoch += 1
             for name, value in batch_val_metrics.loss.items():
                 ep_val_metrics.loss[name] += value
@@ -1213,12 +1214,8 @@ def train_tf_dataset(
         )
     pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
 
-    train_metrics = TrainingMetrics(
-        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
-    )
-    val_metrics = TrainingMetrics(
-        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
-    )
+    train_metrics = TrainingMetrics(loss={}, score={})
+    val_metrics = TrainingMetrics(loss={}, score={})
     for ep in pbar:
 
         ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
@@ -1248,7 +1245,8 @@ def train_tf_dataset(
         for name, value in ep_train_metrics.score.items():
             train_metrics.score[name] = value / batches_per_epoch
 
-        postfix_dict = _create_postfix_dict(train_metrics)
+        postfix_dict = {}
+        postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
 
         if eval_init_op is not None:
             if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
@@ -1261,17 +1259,16 @@ def train_tf_dataset(
                     ep_batch_size,
                 )
 
-            postfix_dict = _create_postfix_dict(val_metrics, "val_")
+            postfix_dict = _update_postfix_dict(postfix_dict, val_metrics, "val_")
 
         pbar.set_postfix(postfix_dict)
 
     logger.info("Finished training.")
 
 
-def _create_postfix_dict(
-    metrics: TrainingMetrics, prefix: Text = ""
+def _update_postfix_dict(
+    postfix_dict: Dict[Text, Text], metrics: TrainingMetrics, prefix: Text = ""
 ) -> Dict[Text, Text]:
-    postfix_dict = {}
     for name, value in metrics.loss.items():
         postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
     for name, value in metrics.score.items():

From fbb24ee82fcc153a2961d7a58f089fc9dd71dc8c Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 15 Nov 2019 02:07:49 +0100
Subject: [PATCH 021/633] added batch support

---
 .../pretrained_lm_featurizer.py               | 123 +++++++++++++++++-
 1 file changed, 119 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 165831576ea9..390bd334a4a0 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -82,6 +82,7 @@ def _load_transformers_params(self):
         )
         self.model = model_dictionary[self.model_key].from_pretrained(self.model_key)
         self.contains_special_token = special_tokens_present[self.model_key]
+        self.model.eval()
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
@@ -96,9 +97,115 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        for example in training_data.intent_examples:
-            for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
-                self._set_lm_features(example, attribute)
+        bs = 128
+
+        for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
+
+            start_index = 0
+
+            while start_index < len(training_data.intent_examples):
+
+                end_index = min(start_index + bs, len(training_data.intent_examples))
+                batch_examples = training_data.intent_examples[start_index:end_index]
+                batch_text = [ex.get(attribute) for ex in batch_examples]
+
+                batch_feats = self._compute_features(batch_text)
+
+                for index, ex in enumerate(batch_examples):
+
+                    ex.set(
+                        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
+                        self._combine_with_existing_dense_features(
+                            ex,
+                            batch_feats[index],
+                            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
+                        ),
+                    )
+
+                start_index += bs
+
+        # for example in training_data.intent_examples:
+        #     for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
+        #         self._set_lm_features(example, attribute)
+
+    def _compute_input_ids(self, batch_examples):
+
+        batch_input_ids = []
+        max_seq_len = 0
+        actual_seq_lengths = []
+        for example in batch_examples:
+
+            example_input_ids = self.tokenizer.encode(
+                example, add_special_tokens=self.contains_special_token
+            )
+            max_seq_len = max(max_seq_len, len(example_input_ids))
+            actual_seq_lengths.append(len(example_input_ids))
+            batch_input_ids.append(example_input_ids)
+
+        # add padding
+        padded_input_ids = []
+        for example_input_ids in batch_input_ids:
+            padded_input_ids.append(
+                example_input_ids
+                + [self.tokenizer.pad_token_id] * (max_seq_len - len(example_input_ids))
+            )
+
+        return torch.tensor(padded_input_ids), actual_seq_lengths
+
+    def _compute_attention_mask(self, actual_seq_lengths):
+
+        attention_mask = []
+        max_seq_length = max(actual_seq_lengths)
+        for index in range(len(actual_seq_lengths)):
+            example_seq_length = actual_seq_lengths[index]
+            attention_mask.append(
+                [1] * example_seq_length + [0] * (max_seq_length - example_seq_length)
+            )
+
+        attention_mask = np.array(attention_mask).astype(np.float32)
+
+        return torch.tensor(attention_mask)
+
+    def _compute_features(self, batch_inputs):
+
+        batch_model_inputs, actual_seq_lengths = self._compute_input_ids(batch_inputs)
+        batch_attention_mask = self._compute_attention_mask(actual_seq_lengths)
+
+        with torch.no_grad():
+            last_hidden_states = self.model(
+                batch_model_inputs, attention_mask=batch_attention_mask
+            )[
+                0
+            ].numpy()  # Models outputs are now numpy array
+            sequence_embedding = last_hidden_states  # First element of batch
+
+            truncated_embeds = self._resolve_special_tokens_in_embeddings(
+                sequence_embedding, actual_seq_lengths
+            )
+
+            return truncated_embeds
+
+    def _resolve_special_tokens_in_embeddings(self, embeddings, actual_seq_lengths):
+
+        truncated_embeds = []
+        for index, embedding in enumerate(embeddings):
+            unmasked_embedding = embedding[: actual_seq_lengths[index]]
+
+            if self.contains_special_token:
+                # dim - (seq + 2, hdim)
+                # Discard SEP token and move CLS token to last index
+                unmasked_embedding = unmasked_embedding[:-1, :]  # Discard SEP
+                unmasked_embedding = np.roll(
+                    unmasked_embedding, -1, axis=0
+                )  # Move CLS to back
+            else:
+                unmasked_embedding = np.concatenate(
+                    [unmasked_embedding, np.zeros((1, unmasked_embedding.shape[-1]))],
+                    axis=0,
+                )
+            truncated_embeds.append(unmasked_embedding)
+
+        return np.array(truncated_embeds)
 
     def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
@@ -144,4 +251,12 @@ def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        self._set_lm_features(message)
+        feats = self._compute_features([message.get(MESSAGE_TEXT_ATTRIBUTE)])
+        message.set(
+            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
+            self._combine_with_existing_dense_features(
+                message,
+                feats[0],
+                MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
+            ),
+        )

From 61112317c37fed1ae9e12a727c68ded35c97e656 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 10:25:21 +0100
Subject: [PATCH 022/633] remove duplicated dense layer for a_in

---
 .../embedding_intent_classifier.py            |  8 ++++---
 rasa/utils/train_utils.py                     | 23 ++++++++-----------
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index e298ca22c797..6d3a53d9faaf 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -647,11 +647,13 @@ def combine_sparse_dense_features(
     def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         """Create sequence level embedding and mask."""
 
-        a_in = self._create_tf_embed_fnn(
+        a_in = train_utils.create_tf_fnn(
             a_in,
             self.hidden_layer_sizes["text"],
-            fnn_name="text_intent" if self.share_hidden_layers else "text",
-            embed_name="text",
+            self.droprate,
+            self.C2,
+            self._is_training,
+            layer_name_suffix="text_intent" if self.share_hidden_layers else "text",
         )
 
         self.attention_weights = {}
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index bc06062fbc55..7a2c4f5d7065 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -715,8 +715,12 @@ def create_t2t_transformer_encoder(
     is_training: "tf.Tensor",
 ) -> "tf.Tensor":
     """Create t2t transformer encoder."""
-
     with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
+        if len(mask.shape) == 2:
+            _mask = tf.expand_dims(mask, -1)
+        else:
+            _mask = mask
+
         x = create_tf_fnn(
             x_in,
             [hparams.hidden_size],
@@ -733,20 +737,14 @@ def create_t2t_transformer_encoder(
         if hparams.multiply_embedding_mode == "sqrt_depth":
             x *= hparams.hidden_size ** 0.5
 
-        if len(mask.shape) == 2:
-            x *= tf.expand_dims(mask, -1)
-        else:
-            x *= mask
+        x *= _mask
         (
             x,
             self_attention_bias,
             encoder_decoder_attention_bias,
         ) = transformer_prepare_encoder(x, None, hparams)
 
-        if len(mask.shape) == 2:
-            x *= tf.expand_dims(mask, -1)
-        else:
-            x *= mask
+        x *= _mask
 
         x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
 
@@ -759,15 +757,12 @@ def create_t2t_transformer_encoder(
             x,
             self_attention_bias,
             hparams,
-            nonpadding=mask,
+            nonpadding=_mask,
             save_weights_to=attention_weights,
             attn_bias_for_padding=attn_bias_for_padding,
         )
 
-        if len(mask.shape) == 2:
-            x *= tf.expand_dims(mask, -1)
-        else:
-            x *= mask
+        x *= _mask
 
         return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
 

From a571a8d26a42f5492d88d6a2c11ec75cb94f2abd Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 15 Nov 2019 11:55:53 +0100
Subject: [PATCH 023/633] add regularizer for entities

---
 rasa/core/policies/embedding_policy.py        |  4 +--
 .../embedding_intent_classifier.py            | 29 +++++++------------
 rasa/utils/train_utils.py                     | 10 +++++--
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 91a9eda7f919..306a17e66118 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -286,7 +286,7 @@ def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor":
             layer_name_suffix="bot",
         )
         return train_utils.create_tf_embed(
-            b, self.embed_dim, self.C2, self.similarity_type, layer_name_suffix="bot"
+            b, self.embed_dim, self.C2, "bot", self.similarity_type
         )
 
     def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]:
@@ -326,7 +326,7 @@ def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]:
             mask = mask[:, -1:]
 
         dial_embed = train_utils.create_tf_embed(
-            a, self.embed_dim, self.C2, self.similarity_type, layer_name_suffix="dial"
+            a, self.embed_dim, self.C2, "dial", self.similarity_type
         )
 
         return dial_embed, mask
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6d3a53d9faaf..5b5e9fb4e7e2 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -614,11 +614,7 @@ def _create_tf_embed_fnn(
             layer_name_suffix=fnn_name,
         )
         return train_utils.create_tf_embed(
-            x,
-            self.embed_dim,
-            self.C2,
-            self.similarity_type,
-            layer_name_suffix=embed_name,
+            x, self.embed_dim, self.C2, embed_name, self.similarity_type
         )
 
     def combine_sparse_dense_features(
@@ -753,11 +749,7 @@ def _train_intent_graph(
         # get _cls_ vector for intent classification
         self.cls_embed = tf.reduce_sum(a * last, 1)
         self.cls_embed = train_utils.create_tf_embed(
-            self.cls_embed,
-            self.embed_dim,
-            self.C2,
-            self.similarity_type,
-            layer_name_suffix="cls",
+            self.cls_embed, self.embed_dim, self.C2, "cls", self.similarity_type
         )
 
         b = tf.reduce_sum(tf.nn.relu(b), 1)
@@ -796,9 +788,14 @@ def _create_crf(
         self, input: tf.Tensor, sequence_lengths: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
-            logits = tf.layers.dense(input, self.num_tags, name="crf-logits")
+            logits = train_utils.create_tf_embed(
+                input, self.num_tags, self.C2, "crf-logits"
+            )
             crf_params = tf.get_variable(
-                "crf-params", [self.num_tags, self.num_tags], dtype=tf.float32
+                "crf-params",
+                [self.num_tags, self.num_tags],
+                dtype=tf.float32,
+                regularizer=tf.contrib.layers.l2_regularizer(self.C2),
             )
             pred_ids, _ = tf.contrib.crf.crf_decode(
                 logits, crf_params, sequence_lengths
@@ -806,7 +803,7 @@ def _create_crf(
 
             return crf_params, logits, pred_ids
 
-    def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor":
+    def _build_tf_pred_graph(self, session_data: "SessionDataType"):
 
         shapes, types = train_utils.get_shapes_types(session_data)
 
@@ -843,11 +840,7 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         # get _cls_ embedding
         self.cls_embed = tf.reduce_sum(a * last, 1)
         self.cls_embed = train_utils.create_tf_embed(
-            self.cls_embed,
-            self.embed_dim,
-            self.C2,
-            self.similarity_type,
-            layer_name_suffix="cls",
+            self.cls_embed, self.embed_dim, self.C2, "cls", self.similarity_type
         )
 
         b = tf.reduce_sum(b, 1)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 7a2c4f5d7065..a7b7028fcad1 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -648,8 +648,8 @@ def create_tf_embed(
     x: "tf.Tensor",
     embed_dim: int,
     C2: float,
-    similarity_type: Text,
     layer_name_suffix: Text,
+    similarity_type: Optional[Text] = None,
 ) -> "tf.Tensor":
     """Create dense embedding layer with a name."""
 
@@ -662,8 +662,12 @@ def create_tf_embed(
         name=f"embed_layer_{layer_name_suffix}",
         reuse=tf.AUTO_REUSE,
     )
-    # normalize embedding vectors for cosine similarity
-    return tf_normalize_if_cosine(embed_x, similarity_type)
+
+    if similarity_type:
+        # normalize embedding vectors for cosine similarity
+        return tf_normalize_if_cosine(embed_x, similarity_type)
+
+    return embed_x
 
 
 def create_t2t_hparams(

From cce2758fa8161d40ce1d2f4f687b85d5688e6031 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 12:57:03 +0100
Subject: [PATCH 024/633] exclude lm featurizer from test.

---
 .../dense_featurizer/pretrained_lm_featurizer.py       | 10 +---------
 tests/nlu/training/test_train.py                       |  3 +++
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 390bd334a4a0..11214a1f9538 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -1,19 +1,11 @@
 import logging
-import os
-import re
-import scipy.sparse
-from typing import Any, Dict, List, Optional, Text
-from rasa.nlu import utils
+from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurzier import Featurizer
-from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
     MESSAGE_TEXT_ATTRIBUTE,
-    MESSAGE_TOKENS_NAMES,
-    MESSAGE_ATTRIBUTES,
     MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
-    MESSAGE_INTENT_ATTRIBUTE,
     SPACY_FEATURIZABLE_ATTRIBUTES,
 )
 import torch
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index a30a32892303..41c1d8a15dc6 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -67,6 +67,9 @@ def test_all_components_are_in_at_least_one_test_pipeline():
 
     all_components = [c["name"] for _, p in pipelines_for_tests() for c in p]
     for cls in registry.component_classes:
+        # different tokenization is needed
+        if cls.name == "PreTrainedLMFeaturizer":
+            continue
         assert (
             cls.name in all_components
         ), "`all_components` template is missing component."

From 2104a8bb58abd08fbd8f15d88059f41d365b3167 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 13:15:49 +0100
Subject: [PATCH 025/633] write hermit predictions

---
 rasa/nlu/test.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index 831001eb93f0..a4bc58cd20a2 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -1053,6 +1053,10 @@ def run_evaluation(
             entity_results, extractors, output_directory, successes, errors
         )
 
+        write_prediction_for_hermit_evaluation(
+            entity_results, intent_results, extractors, output_directory
+        )
+
     return result
 
 
@@ -1409,6 +1413,58 @@ def return_entity_results(results: EntityMetrics, dataset_name: Text) -> None:
         return_results(result, dataset_name)
 
 
+def write_prediction_for_hermit_evaluation(
+    entity_results, intent_results, extractors, output_directory
+):
+    import json
+
+    out = []
+    aligned_predictions = align_all_entity_predictions(entity_results, extractors)
+
+    for intent_result, e_pred, entity_result in zip(
+        intent_results, aligned_predictions, entity_results
+    ):
+
+        entity_gold = e_pred["target_labels"]
+        entity_pred = e_pred["extractor_labels"]["EmbeddingIntentClassifier"]
+
+        last = "O"
+        for j in range(len(entity_pred)):
+            if entity_pred[j] != "O" and last != entity_pred[j]:
+                last = entity_pred[j]
+                entity_pred[j] = "B-" + entity_pred[j]
+            elif entity_pred[j] != "O" and last == entity_pred[j]:
+                last = entity_pred[j]
+                entity_pred[j] = "I-" + entity_pred[j]
+            else:
+                last = entity_pred[j]
+
+        last = "O"
+        for j in range(len(entity_gold)):
+            if entity_gold[j] != "O" and last != entity_gold[j]:
+                last = entity_gold[j]
+                entity_gold[j] = "B-" + entity_gold[j]
+            elif entity_gold[j] != "O" and last == entity_gold[j]:
+                last = entity_gold[j]
+                entity_gold[j] = "I-" + entity_gold[j]
+            else:
+                last = entity_gold[j]
+
+        obj = {
+            "tokens": [t.text for t in entity_result.tokens],
+            "intent_gold": [intent_result.intent_target for _ in entity_result.tokens],
+            "intent_pred": [
+                intent_result.intent_prediction for _ in entity_result.tokens
+            ],
+            "frame_element_gold": entity_gold,
+            "frame_element_pred": entity_pred,
+        }
+        out.append(obj)
+
+    with open(os.path.join(output_directory, "hermit_eval.json"), "w") as outfile:
+        json.dump(out, outfile, indent=2)
+
+
 if __name__ == "__main__":
     raise RuntimeError(
         "Calling `rasa.nlu.test` directly is no longer supported. Please use "

From dbcfd835b4dd727eb1aefa2894fed0c40eb2c0f4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 14:51:48 +0100
Subject: [PATCH 026/633] remove not needed imports

---
 .../dense_featurizer/pretrained_lm_featurizer.py          | 1 -
 rasa/nlu/registry.py                                      | 8 ++++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 11214a1f9538..25a9370b98d7 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -1,4 +1,3 @@
-import logging
 from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurzier import Featurizer
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 1879100359ee..776854e296f4 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -8,6 +8,9 @@
 import typing
 from typing import Any, Dict, List, Optional, Text, Type
 
+from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
+    PreTrainedLMFeaturizer,
+)
 from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
@@ -22,12 +25,9 @@
     CountVectorsFeaturizer,
 )
 from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer
-from rasa.nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
-from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
-    PreTrainedLMFeaturizer,
-)
+
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer

From 5e6d5b6328f3da6ef13ee9b7a00df5186964c682 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 14:52:39 +0100
Subject: [PATCH 027/633] fix split train val

---
 rasa/utils/train_utils.py       | 12 +++++++-----
 tests/utils/test_train_utils.py |  6 ++++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 4fa78d7390b4..4d278385c41a 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -155,14 +155,16 @@ def convert_train_test_split(
     # order is kept, e.g. same order as session data keys
 
     # train datasets have an even index
-    for i in range(len(session_data)):
-        session_data_train[keys[i]].append(
-            combine_features(output_values[i * 2], solo_values[i])
-        )
+    for i in range(len(keys)):
+        for j in range(len(session_data[keys[i]])):
+            session_data_train[keys[i]].append(
+                combine_features(output_values[(i + j) * 2], solo_values[i])
+            )
 
     # val datasets have an odd index
     for i in range(len(session_data)):
-        session_data_val[keys[i]].append(output_values[(i * 2) + 1])
+        for j in range(len(session_data[keys[i]])):
+            session_data_val[keys[i]].append(output_values[((i + j) * 2) + 1])
 
     return session_data_train, session_data_val
 
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index dcad2a8790c2..bdbb89cde492 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -84,6 +84,12 @@ def test_train_val_split(session_data: SessionDataType):
         session_data, 2, 42, "intent_ids"
     )
 
+    for k, values in session_data.items():
+        assert len(values) == len(train_session_data[k])
+        assert len(values) == len(val_session_data[k])
+        for i, v in enumerate(values):
+            assert v[0].dtype == train_session_data[k][i][0].dtype
+
     for values in train_session_data.values():
         for v in values:
             assert v.shape[0] == 3

From 31916fde94a715cc99bee2c5f15b921fea931ab1 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 15 Nov 2019 16:08:35 +0100
Subject: [PATCH 028/633] added tokenizer

---
 .../pretrained_lm_featurizer.py               | 99 +++++++++++++------
 rasa/nlu/registry.py                          |  2 +
 rasa/nlu/tokenizers/tokenizer.py              |  3 +-
 3 files changed, 71 insertions(+), 33 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 390bd334a4a0..93e6e70780ce 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -25,34 +25,34 @@
 model_dictionary = {
     "bert-base-uncased": BertModel,
     "openai-gpt": OpenAIGPTModel,
-    "gpt2": GPT2Model,
-    "transfo-xl-wt103": TransfoXLModel,
-    "xlnet-base-cased": XLNetModel,
-    "xlm-mlm-enfr-1024": XLMModel,
-    "distilbert-base-uncased": DistilBertModel,
-    "roberta-base": RobertaModel,
+    # "gpt2": GPT2Model,
+    # "transfo-xl-wt103": TransfoXLModel,
+    # "xlnet-base-cased": XLNetModel,
+    # "xlm-mlm-enfr-1024": XLMModel,
+    # "distilbert-base-uncased": DistilBertModel,
+    # "roberta-base": RobertaModel,
 }
 
 tokenizer_dictionary = {
     "bert-base-uncased": BertTokenizer,
     "openai-gpt": OpenAIGPTTokenizer,
-    "gpt2": GPT2Tokenizer,
-    "transfo-xl-wt103": TransfoXLTokenizer,
-    "xlnet-base-cased": XLNetTokenizer,
-    "xlm-mlm-enfr-1024": XLMTokenizer,
-    "distilbert-base-uncased": DistilBertTokenizer,
-    "roberta-base": RobertaTokenizer,
+    # "gpt2": GPT2Tokenizer,
+    # "transfo-xl-wt103": TransfoXLTokenizer,
+    # "xlnet-base-cased": XLNetTokenizer,
+    # "xlm-mlm-enfr-1024": XLMTokenizer,
+    # "distilbert-base-uncased": DistilBertTokenizer,
+    # "roberta-base": RobertaTokenizer,
 }
 
 special_tokens_present = {
     "bert-base-uncased": True,
     "openai-gpt": False,
-    "gpt2": False,
-    "transfo-xl-wt103": False,
-    "xlnet-base-cased": True,
-    "xlm-mlm-enfr-1024": True,
-    "distilbert-base-uncased": True,
-    "roberta-base": True,
+    # "gpt2": False,
+    # "transfo-xl-wt103": False,
+    # "xlnet-base-cased": True,
+    # "xlm-mlm-enfr-1024": True,
+    # "distilbert-base-uncased": True,
+    # "roberta-base": True,
 }
 
 
@@ -70,19 +70,23 @@ class PreTrainedLMFeaturizer(Featurizer):
 
     def _load_transformers_params(self):
 
-        self.model_key = self.component_config["model_key"]
+        self.lm_key = self.component_config["lm_key"]
 
-        if self.model_key not in tokenizer_dictionary:
-            logger.error("{} not a valid model key name".format(self.model_key))
+        if self.lm_key not in tokenizer_dictionary:
+            logger.error("{} not a valid model key name".format(self.lm_key))
             raise
 
-        logger.info("Loading Tokenizer and Model for {}".format(self.model_key))
-        self.tokenizer = tokenizer_dictionary[self.model_key].from_pretrained(
-            self.model_key
-        )
-        self.model = model_dictionary[self.model_key].from_pretrained(self.model_key)
-        self.contains_special_token = special_tokens_present[self.model_key]
-        self.model.eval()
+        logger.info("Loading Tokenizer and Model for {}".format(self.lm_key))
+        self.tokenizer = tokenizer_dictionary[self.lm_key].from_pretrained(self.lm_key)
+        self.model = model_dictionary[self.lm_key].from_pretrained(self.lm_key)
+        self.contains_special_token = special_tokens_present[self.lm_key]
+        if self.contains_special_token:
+            self.pad_token_id = self.tokenizer.pad_token_id
+        else:
+            special_tokens_dict = {"pad_token": "[PAD]"}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            self.model.resize_token_embeddings(len(self.tokenizer))
+            self.pad_token_id = self.tokenizer.pad_token_id
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
@@ -107,7 +111,9 @@ def train(
 
                 end_index = min(start_index + bs, len(training_data.intent_examples))
                 batch_examples = training_data.intent_examples[start_index:end_index]
-                batch_text = [ex.get(attribute) for ex in batch_examples]
+                batch_text = [
+                    self._clean_text(ex.get(attribute)) for ex in batch_examples
+                ]
 
                 batch_feats = self._compute_features(batch_text)
 
@@ -122,12 +128,36 @@ def train(
                         ),
                     )
 
+                    # print(ex.get(attribute), batch_feats[index].shape[0])
+
                 start_index += bs
 
         # for example in training_data.intent_examples:
         #     for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
         #         self._set_lm_features(example, attribute)
 
+    @staticmethod
+    def _clean_text(text):
+
+        cleaned_text = re.sub(
+            # there is a space or an end of a string after it
+            r"[^\w#@&]+(?=\s|$)|"
+            # there is a space or beginning of a string before it
+            # not followed by a number
+            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
+            # not in between numbers and not . or @ or & or - or #
+            # e.g. 10'000.00 or blabla@gmail.com
+            # and not url characters
+            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
+            " ",
+            text,
+        )
+
+        if not cleaned_text.strip():
+            cleaned_text = text
+
+        return cleaned_text
+
     def _compute_input_ids(self, batch_examples):
 
         batch_input_ids = []
@@ -144,10 +174,15 @@ def _compute_input_ids(self, batch_examples):
 
         # add padding
         padded_input_ids = []
+
+        # Some models don't contain pad token, we use unknown token as padding token.This doesn't affect the computation
+        # since we compute an attention mask anyways.
+
+        # pad_token_id = self.tokenizer.pad_token_id if self.contains_special_token else self.tokenizer.unk_token_id
         for example_input_ids in batch_input_ids:
             padded_input_ids.append(
                 example_input_ids
-                + [self.tokenizer.pad_token_id] * (max_seq_len - len(example_input_ids))
+                + [self.pad_token_id] * (max_seq_len - len(example_input_ids))
             )
 
         return torch.tensor(padded_input_ids), actual_seq_lengths
@@ -179,13 +214,13 @@ def _compute_features(self, batch_inputs):
             ].numpy()  # Models outputs are now numpy array
             sequence_embedding = last_hidden_states  # First element of batch
 
-            truncated_embeds = self._resolve_special_tokens_in_embeddings(
+            truncated_embeds = self._extract_nonpadded_embeddings(
                 sequence_embedding, actual_seq_lengths
             )
 
             return truncated_embeds
 
-    def _resolve_special_tokens_in_embeddings(self, embeddings, actual_seq_lengths):
+    def _extract_nonpadded_embeddings(self, embeddings, actual_seq_lengths):
 
         truncated_embeds = []
         for index, embedding in enumerate(embeddings):
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 1879100359ee..0a5263e73d2e 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -33,6 +33,7 @@
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.tokenizers.pretrained_lm_tokenizer import PreTrainedLMTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.utils.common import class_from_module_path
@@ -56,6 +57,7 @@
     SpacyTokenizer,
     WhitespaceTokenizer,
     JiebaTokenizer,
+    PreTrainedLMTokenizer,
     # extractors
     SpacyEntityExtractor,
     MitieEntityExtractor,
diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py
index e0981572d4f4..30c39b3b2a2d 100644
--- a/rasa/nlu/tokenizers/tokenizer.py
+++ b/rasa/nlu/tokenizers/tokenizer.py
@@ -19,10 +19,11 @@ def __init__(
         offset: int,
         data: Optional[Dict[Text, Any]] = None,
         lemma: Optional[Text] = None,
+        end: int = None,
     ):
         self.offset = offset
         self.text = text
-        self.end = offset + len(text)
+        self.end = end if end else offset + len(text)
         self.data = data if data else {}
         self.lemma = lemma or text
 

From 5692a31a910549f7d08d63331ac479f005645209 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 15 Nov 2019 16:10:31 +0100
Subject: [PATCH 029/633] added tokenizer

---
 .../nlu/tokenizers/pretrained_lm_tokenizer.py | 149 ++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 rasa/nlu/tokenizers/pretrained_lm_tokenizer.py

diff --git a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
new file mode 100644
index 000000000000..b77e1858efce
--- /dev/null
+++ b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
@@ -0,0 +1,149 @@
+import logging
+import os
+import re
+import scipy.sparse
+from typing import Any, Dict, List, Optional, Text
+from rasa.nlu import utils
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    MESSAGE_TEXT_ATTRIBUTE,
+    MESSAGE_TOKENS_NAMES,
+    MESSAGE_ATTRIBUTES,
+    MESSAGE_INTENT_ATTRIBUTE,
+    SPACY_FEATURIZABLE_ATTRIBUTES,
+)
+import torch
+from transformers import *
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+tokenizer_dictionary = {
+    "bert-base-uncased": BertTokenizer,
+    "openai-gpt": OpenAIGPTTokenizer,
+    # "gpt2": GPT2Tokenizer,
+    # "transfo-xl-wt103": TransfoXLTokenizer,
+    # "xlnet-base-cased": XLNetTokenizer,
+    # "xlm-mlm-enfr-1024": XLMTokenizer,
+    # "distilbert-base-uncased": DistilBertTokenizer,
+    # "roberta-base": RobertaTokenizer,
+}
+
+special_tokens_present = {
+    "bert-base-uncased": True,
+    "openai-gpt": False,
+    # "gpt2": False,
+    # "transfo-xl-wt103": False,
+    # "xlnet-base-cased": True,
+    # "xlm-mlm-enfr-1024": True,
+    # "distilbert-base-uncased": True,
+    # "roberta-base": True,
+}
+
+
+class PreTrainedLMTokenizer(Tokenizer):
+
+    provides = [
+        MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # model key identified by HF Transformers
+        "use_cls_token": True,
+        "lm_key": "bert-base-uncased",
+    }
+
+    def _load_tokenizer_params(self):
+
+        self.lm_key = self.component_config["lm_key"]
+
+        if self.lm_key not in tokenizer_dictionary:
+            logger.error("{} not a valid model key name".format(self.lm_key))
+            raise
+
+        logger.info("Loading Tokenizer for {}".format(self.lm_key))
+        self.tokenizer = tokenizer_dictionary[self.lm_key].from_pretrained(self.lm_key)
+        self.contains_special_token = special_tokens_present[self.lm_key]
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+
+        super(PreTrainedLMTokenizer, self).__init__(component_config)
+
+        self._load_tokenizer_params()
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig],
+        **kwargs: Any,
+    ) -> None:
+
+        for example in training_data.intent_examples:
+            for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
+                example.set(
+                    MESSAGE_TOKENS_NAMES[attribute],
+                    self._get_lm_tokens(example, attribute),
+                )
+
+    def _get_lm_tokens(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
+
+        message_attribute_text = example.get(attribute)
+        if message_attribute_text:
+
+            expanded_tokens_list = []
+
+            # We assume that whitespace tokenizer was used before this and hence tokens attribute is set.
+            space_tokens_list = example.get(MESSAGE_TOKENS_NAMES[attribute])
+
+            for token in space_tokens_list:
+
+                token_start, token_end, token_text = token.offset, token.end, token.text
+
+                # Encode text
+
+                # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
+                split_token_ids = self.tokenizer.encode(token_text)
+
+                split_token_strings = self.tokenizer.convert_ids_to_tokens(
+                    split_token_ids
+                )
+
+                # print(split_token_strings)
+
+                current_token_offset = token_start
+                for index, string in enumerate(split_token_strings):
+                    if index == 0:
+                        expanded_tokens_list.append(
+                            Token(
+                                string,
+                                token_start,
+                                end=current_token_offset + len(string),
+                            )
+                        )
+                    elif index == len(split_token_strings) - 1:
+                        expanded_tokens_list.append(
+                            Token(string, current_token_offset, end=token_end)
+                        )
+                    else:
+                        expanded_tokens_list.append(
+                            Token(
+                                string,
+                                current_token_offset,
+                                end=current_token_offset + len(string),
+                            )
+                        )
+                    current_token_offset += len(string)
+
+            expanded_tokens_list = self.add_cls_token(expanded_tokens_list, attribute)
+
+            # print(message_attribute_text, len(space_tokens_list), len(expanded_tokens_list))
+
+            return expanded_tokens_list
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        tokens = self._get_lm_tokens(message)
+        message.set(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], tokens)

From d74c73c1de107749f31a58748f02ff94a87f2302 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 15 Nov 2019 16:25:47 +0100
Subject: [PATCH 030/633] fix for length

---
 rasa/nlu/tokenizers/pretrained_lm_tokenizer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
index b77e1858efce..e1b687774836 100644
--- a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
@@ -116,12 +116,12 @@ def _get_lm_tokens(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
                 current_token_offset = token_start
                 for index, string in enumerate(split_token_strings):
                     if index == 0:
+                        if index == len(split_token_strings) - 1:
+                            s_token_end = token_end
+                        else:
+                            s_token_end = current_token_offset + len(string)
                         expanded_tokens_list.append(
-                            Token(
-                                string,
-                                token_start,
-                                end=current_token_offset + len(string),
-                            )
+                            Token(string, token_start, end=s_token_end)
                         )
                     elif index == len(split_token_strings) - 1:
                         expanded_tokens_list.append(

From c02f5e7050e96c14a3fcab7365b1ebb7a117d4a8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 16:26:42 +0100
Subject: [PATCH 031/633] fix import

---
 .../nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 7fa62f2ff251..c18ded78fe64 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -8,6 +8,7 @@
     SPACY_FEATURIZABLE_ATTRIBUTES,
 )
 import torch
+import re
 from transformers import *
 import numpy as np
 

From cad9302ee911b37f9ce21baa75665981513c3bb9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 15 Nov 2019 17:55:05 +0100
Subject: [PATCH 032/633] update requirements.txt

---
 requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index b517977199bd..c514abc2bff8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,3 +60,8 @@ gast==0.2.2
 torch
 torchvision
 transformers
+# for hermit evaluation
+pandas
+progress
+# to calculate f1 score in new architecture
+git+https://github.com/guillaumegenthial/tf_metrics.git
\ No newline at end of file

From dbe23c8f0de7270027d4a7172c31283b0602bc66 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 18 Nov 2019 09:51:49 +0100
Subject: [PATCH 033/633] fix process in lm featurizer

---
 .../featurizers/dense_featurizer/pretrained_lm_featurizer.py  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index c18ded78fe64..f5c360355e3f 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -278,7 +278,9 @@ def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        feats = self._compute_features([message.get(MESSAGE_TEXT_ATTRIBUTE)])
+        cleaned_text = self._clean_text(message.get(MESSAGE_TEXT_ATTRIBUTE))
+
+        feats = self._compute_features([cleaned_text])
         message.set(
             MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
             self._combine_with_existing_dense_features(

From 1d74397fd23359ad2472d4a7bb327562feea7c9f Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 18 Nov 2019 12:05:27 +0100
Subject: [PATCH 034/633] added convert featurizer

---
 .../dense_featurizer/convert_featurizer.py    | 202 ++++++++++++++++++
 .../pretrained_lm_featurizer.py               |   2 +-
 rasa/nlu/registry.py                          |   4 +
 rasa/nlu/tokenizers/convert_tokenizer.py      | 138 ++++++++++++
 4 files changed, 345 insertions(+), 1 deletion(-)
 create mode 100644 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
 create mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
new file mode 100644
index 000000000000..efdc91713230
--- /dev/null
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -0,0 +1,202 @@
+import logging
+import os
+import re
+import scipy.sparse
+from rasa.nlu.featurizers.featurzier import Featurizer
+from typing import Any, Dict, List, Optional, Text
+from rasa.nlu import utils
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    MESSAGE_TEXT_ATTRIBUTE,
+    MESSAGE_TOKENS_NAMES,
+    MESSAGE_ATTRIBUTES,
+    MESSAGE_INTENT_ATTRIBUTE,
+    MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
+    SPACY_FEATURIZABLE_ATTRIBUTES,
+)
+import numpy as np
+import tensorflow as tf
+import tensorflow_text
+import tensorflow_hub as tfhub
+
+logger = logging.getLogger(__name__)
+
+
+class ConvertFeaturizer(Featurizer):
+    provides = [
+        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
+        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # model key identified by HF Transformers
+        "return_sequence": True
+    }
+
+    def _load_model(self):
+
+        self.return_sequence = self.component_config["return_sequence"]
+
+        self.graph = tf.Graph()
+        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+
+        with self.graph.as_default():
+            self.session = tf.Session()
+            self.module = tfhub.Module(model_url)
+
+            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
+            if self.return_sequence:
+                self.sequence_encoding_tensor = self.module(
+                    self.text_placeholder, signature="encode_sequence", as_dict=True
+                )
+                self.tokenized = self.module(
+                    self.text_placeholder, signature="tokenize"
+                )
+            self.sentence_encoding_tensor = self.module(self.text_placeholder)
+            self.session.run(tf.tables_initializer())
+            self.session.run(tf.global_variables_initializer())
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+
+        super(ConvertFeaturizer, self).__init__(component_config)
+
+        self._load_model()
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig],
+        **kwargs: Any,
+    ) -> None:
+
+        bs = 1
+
+        for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
+
+            start_index = 0
+
+            while start_index < len(training_data.intent_examples):
+
+                end_index = min(start_index + bs, len(training_data.intent_examples))
+                batch_examples = training_data.intent_examples[start_index:end_index]
+
+                batch_text = [
+                    self._clean_text(ex.get(attribute)) for ex in batch_examples
+                ]
+
+                batch_feats = self._compute_features(batch_text)
+
+                for index, ex in enumerate(batch_examples):
+
+                    # print(batch_text[index], batch_feats[index].shape)
+                    ex.set(
+                        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
+                        self._combine_with_existing_dense_features(
+                            ex,
+                            batch_feats[index],
+                            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
+                        ),
+                    )
+
+                start_index += bs
+
+    @staticmethod
+    def _clean_text(text):
+
+        cleaned_text = re.sub(
+            # there is a space or an end of a string after it
+            r"[^\w#@&]+(?=\s|$)|"
+            # there is a space or beginning of a string before it
+            # not followed by a number
+            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
+            # not in between numbers and not . or @ or & or - or #
+            # e.g. 10'000.00 or blabla@gmail.com
+            # and not url characters
+            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
+            " ",
+            text,
+        )
+
+        if not cleaned_text.strip():
+            cleaned_text = text
+
+        return cleaned_text.strip()
+
+    def _tokenize(self, sentence):
+
+        return self.session.run(
+            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
+        )
+
+    def _compute_features(self, batch_examples):
+
+        sentence_encodings = self.session.run(
+            self.sentence_encoding_tensor,
+            feed_dict={self.text_placeholder: batch_examples},
+        )
+
+        # convert them to a sequence
+        sentence_encodings = np.reshape(
+            sentence_encodings, (len(batch_examples), 1, -1)
+        )
+
+        if self.return_sequence:
+
+            final_embeddings = []
+
+            batch_tokenized = [self._tokenize(sentence) for sentence in batch_examples]
+
+            actual_lens = [token_vector.shape[1] for token_vector in batch_tokenized]
+
+            sequence_encodings = self.session.run(
+                self.sequence_encoding_tensor,
+                feed_dict={self.text_placeholder: batch_examples},
+            )["sequence_encoding"]
+
+            for index in range(len(batch_examples)):
+
+                seq_len = actual_lens[index]
+                seq_enc = sequence_encodings[index][:seq_len]
+                sent_enc = sentence_encodings[index]
+
+                # tile seq enc to duplicate
+                seq_enc = np.tile(seq_enc, (1, 2))
+
+                # add sent_enc to the end
+                seq_enc = np.concatenate([seq_enc, sent_enc], axis=0)
+
+                final_embeddings.append(seq_enc)
+
+            return final_embeddings
+
+        return sentence_encodings
+
+    def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
+
+        message_attribute_text = example.get(attribute)
+        if message_attribute_text:
+            # Encode text
+            features = self.module([message_attribute_text])[0]
+            features = self._combine_with_existing_features(
+                example, features, MESSAGE_VECTOR_FEATURE_NAMES[attribute]
+            )
+            # print(features.shape)
+            example.set(MESSAGE_VECTOR_FEATURE_NAMES[attribute], features)
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        feats = self._compute_features(
+            [self._clean_text(message.get(MESSAGE_TEXT_ATTRIBUTE))]
+        )[0]
+
+        message.set(
+            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
+            self._combine_with_existing_dense_features(
+                message,
+                feats,
+                MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
+            ),
+        )
+        # self._set_lm_features(message)
diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
index 93e6e70780ce..3ca2ba09f1b2 100644
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
@@ -156,7 +156,7 @@ def _clean_text(text):
         if not cleaned_text.strip():
             cleaned_text = text
 
-        return cleaned_text
+        return cleaned_text.strip()
 
     def _compute_input_ids(self, batch_examples):
 
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 0a5263e73d2e..cddf7fcfd227 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -28,12 +28,14 @@
 from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
     PreTrainedLMFeaturizer,
 )
+from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConvertFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.tokenizers.pretrained_lm_tokenizer import PreTrainedLMTokenizer
+from rasa.nlu.tokenizers.convert_tokenizer import ConvertTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.utils.common import class_from_module_path
@@ -58,6 +60,7 @@
     WhitespaceTokenizer,
     JiebaTokenizer,
     PreTrainedLMTokenizer,
+    ConvertTokenizer,
     # extractors
     SpacyEntityExtractor,
     MitieEntityExtractor,
@@ -70,6 +73,7 @@
     RegexFeaturizer,
     CountVectorsFeaturizer,
     PreTrainedLMFeaturizer,
+    ConvertFeaturizer,
     # classifiers
     SklearnIntentClassifier,
     MitieIntentClassifier,
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
new file mode 100644
index 000000000000..1cbeadd12f95
--- /dev/null
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -0,0 +1,138 @@
+import logging
+import os
+import re
+import scipy.sparse
+from typing import Any, Dict, List, Optional, Text
+from rasa.nlu import utils
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    MESSAGE_TEXT_ATTRIBUTE,
+    MESSAGE_TOKENS_NAMES,
+    MESSAGE_ATTRIBUTES,
+    MESSAGE_INTENT_ATTRIBUTE,
+    SPACY_FEATURIZABLE_ATTRIBUTES,
+)
+import torch
+from transformers import *
+import numpy as np
+import tensorflow as tf
+import tensorflow_hub as tfhub
+
+logger = logging.getLogger(__name__)
+
+
+class ConvertTokenizer(Tokenizer):
+
+    provides = [
+        MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # model key identified by HF Transformers
+        "use_cls_token": True
+    }
+
+    def _load_tokenizer_params(self):
+
+        self.graph = tf.Graph()
+        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+
+        with self.graph.as_default():
+            self.session = tf.Session()
+            self.module = tfhub.Module(model_url)
+
+            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
+            self.tokenized = self.module(self.text_placeholder, signature="tokenize")
+
+            self.session.run(tf.tables_initializer())
+            self.session.run(tf.global_variables_initializer())
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+
+        super(ConvertTokenizer, self).__init__(component_config)
+
+        self._load_tokenizer_params()
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig],
+        **kwargs: Any,
+    ) -> None:
+
+        for example in training_data.intent_examples:
+            for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
+                example.set(
+                    MESSAGE_TOKENS_NAMES[attribute],
+                    self._get_lm_tokens(example, attribute),
+                )
+
+    def _tokenize(self, sentence):
+
+        return self.session.run(
+            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
+        )
+
+    def _get_lm_tokens(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
+
+        message_attribute_text = example.get(attribute)
+        if message_attribute_text:
+
+            expanded_tokens_list = []
+
+            # We assume that whitespace tokenizer was used before this and hence tokens attribute is set.
+            space_tokens_list = example.get(MESSAGE_TOKENS_NAMES[attribute])
+
+            for token in space_tokens_list:
+
+                token_start, token_end, token_text = token.offset, token.end, token.text
+
+                # Encode text
+
+                split_token_strings = self._tokenize(token_text)[0]
+
+                # print(split_token_strings)
+
+                split_token_strings = [
+                    string.decode("utf-8") for string in split_token_strings
+                ]
+
+                # print(token_text, split_token_strings)
+
+                current_token_offset = token_start
+                for index, string in enumerate(split_token_strings):
+                    if index == 0:
+                        if index == len(split_token_strings) - 1:
+                            s_token_end = token_end
+                        else:
+                            s_token_end = current_token_offset + len(string)
+                        expanded_tokens_list.append(
+                            Token(string, token_start, end=s_token_end)
+                        )
+                    elif index == len(split_token_strings) - 1:
+                        expanded_tokens_list.append(
+                            Token(string, current_token_offset, end=token_end)
+                        )
+                    else:
+                        expanded_tokens_list.append(
+                            Token(
+                                string,
+                                current_token_offset,
+                                end=current_token_offset + len(string),
+                            )
+                        )
+                    current_token_offset += len(string)
+
+            expanded_tokens_list = self.add_cls_token(expanded_tokens_list, attribute)
+
+            # print(message_attribute_text, len(space_tokens_list), len(expanded_tokens_list))
+
+            return expanded_tokens_list
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        tokens = self._get_lm_tokens(message)
+        message.set(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], tokens)

From 50ea652d80b535e8ab5caf7f071e409bf1700681 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 18 Nov 2019 12:06:40 +0100
Subject: [PATCH 035/633] added convert featurizer

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index efdc91713230..4297a09a021c 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -71,7 +71,7 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        bs = 1
+        bs = 64
 
         for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
 

From 55b9196c0a75256f7c2ed62833b9880c9a00d299 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 18 Nov 2019 12:56:21 +0100
Subject: [PATCH 036/633] stripped extra space

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 4297a09a021c..0576081df3a6 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -119,6 +119,9 @@ def _clean_text(text):
             text,
         )
 
+        # remove multiple occurences of ' '
+        cleaned_text = re.sub(" +", " ", cleaned_text)
+
         if not cleaned_text.strip():
             cleaned_text = text
 

From 2b80979eadb2a004ba6a138becb6b580c31d9e84 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 19 Nov 2019 10:18:13 +0100
Subject: [PATCH 037/633] Make optimizer configurable

---
 .../embedding_intent_classifier.py            | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 5b5e9fb4e7e2..0738cd0e4c7d 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -97,6 +97,8 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "epochs": 300,
         # set random seed to any int to get reproducible results
         "random_seed": None,
+        # optimizer
+        "optimizer": "Adam",  # can bei either 'Adam' (default) or 'Nadam'
         # embedding parameters
         # default dense dimension used if no dense features are present
         "dense_dim": 512,
@@ -137,8 +139,6 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
         "named_entity_recognition": True,
-        # number of entity tags
-        "num_tags": None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -180,6 +180,8 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
         self.batch_in_size = config["batch_size"]
         self.batch_in_strategy = config["batch_strategy"]
 
+        self.optimizer = config["optimizer"]
+
         self.epochs = config["epochs"]
 
         self.random_seed = self.component_config["random_seed"]
@@ -233,7 +235,6 @@ def _load_params(self) -> None:
         self.named_entity_recognition = self.component_config[
             "named_entity_recognition"
         ]
-        self.num_tags = self.component_config["num_tags"]
 
     # package safety checks
     @classmethod
@@ -292,6 +293,9 @@ def __init__(
         self._train_op = None
         self._is_training = None
 
+        # number of entity tags
+        self.num_tags = 0
+
         self.attention_weights = attention_weights
 
     # training data helpers:
@@ -1093,9 +1097,14 @@ def train(
 
             metrics = self._build_tf_train_graph(session_data)
 
-            # define which optimizer to use
+            # calculate overall loss
             loss = tf.add_n(list(metrics.loss.values()))
-            self._train_op = tf.train.AdamOptimizer().minimize(loss)
+
+            # define which optimizer to use
+            if self.optimizer.lower() == "nadam":
+                self._train_op = tf.contrib.opt.NadamOptimizer().minimize(loss)
+            else:
+                self._train_op = tf.train.AdamOptimizer().minimize(loss)
 
             # train tensorflow graph
             self.session = tf.Session(config=self._tf_config)

From c08dbbac1fa466a4a0512cf2e9294a7aeaff8870 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 19 Nov 2019 13:15:51 +0100
Subject: [PATCH 038/633] add option to normalize loss

---
 .../classifiers/embedding_intent_classifier.py  | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 0738cd0e4c7d..625f3ed9243f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -68,7 +68,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_a": [256, 128],
+        "hidden_layers_sizes_a": [],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
         "hidden_layers_sizes_b": [],
@@ -98,7 +98,8 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # set random seed to any int to get reproducible results
         "random_seed": None,
         # optimizer
-        "optimizer": "Adam",  # can bei either 'Adam' (default) or 'Nadam'
+        "optimizer": "Adam",  # can be either 'Adam' (default) or 'Nadam'
+        "normalize_loss": False,
         # embedding parameters
         # default dense dimension used if no dense features are present
         "dense_dim": 512,
@@ -181,7 +182,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
         self.batch_in_strategy = config["batch_strategy"]
 
         self.optimizer = config["optimizer"]
-
+        self.normalize_loss = config["normalize_loss"]
         self.epochs = config["epochs"]
 
         self.random_seed = self.component_config["random_seed"]
@@ -292,6 +293,7 @@ def __init__(
         self._iterator = None
         self._train_op = None
         self._is_training = None
+        self._in_layer_norm = {}
 
         # number of entity tags
         self.num_tags = 0
@@ -642,6 +644,10 @@ def combine_sparse_dense_features(
             else:
                 dense_features.append(f)
 
+        # if self._in_layer_norm.get(name) is None:
+        #     self._in_layer_norm[name] = tf.keras.layers.LayerNormalization(name=name)
+
+        # return self._in_layer_norm[name](tf.concat(dense_features, axis=-1))
         return tf.concat(dense_features, axis=-1)
 
     def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
@@ -1098,7 +1104,10 @@ def train(
             metrics = self._build_tf_train_graph(session_data)
 
             # calculate overall loss
-            loss = tf.add_n(list(metrics.loss.values()))
+            if self.normalize_loss:
+                loss = tf.add_n([_loss / (tf.stop_gradient(_loss) + 1e-8) for _loss in metrics.loss.values()])
+            else:
+                loss = tf.add_n(list(metrics.loss.values()))
 
             # define which optimizer to use
             if self.optimizer.lower() == "nadam":

From d5e20bba4cb1f53b268613d1f30d4b1a57a36fb0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 20 Nov 2019 09:42:42 +0100
Subject: [PATCH 039/633] Add plotter for training curves.

---
 .../embedding_intent_classifier.py            |  23 +++-
 rasa/utils/plotter.py                         | 113 ++++++++++++++++++
 rasa/utils/train_utils.py                     |  39 +++++-
 requirements.txt                              |   6 +-
 4 files changed, 175 insertions(+), 6 deletions(-)
 create mode 100644 rasa/utils/plotter.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 625f3ed9243f..d9ea913d09b3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,16 +1,19 @@
 import logging
-from collections import defaultdict
 
 import numpy as np
 import os
 import pickle
 import scipy.sparse
 import typing
-from typing import Any, Dict, List, Optional, Text, Tuple, Union
 import warnings
 
+from typing import Any, Dict, List, Optional, Text, Tuple, Union
+from shutil import copyfile
+
 from tf_metrics import f1
 
+import rasa.utils.io as io_utils
+from utils.plotter import Plotter
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -300,6 +303,8 @@ def __init__(
 
         self.attention_weights = attention_weights
 
+        self.training_log_file = io_utils.create_temporary_file("")
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -1105,7 +1110,12 @@ def train(
 
             # calculate overall loss
             if self.normalize_loss:
-                loss = tf.add_n([_loss / (tf.stop_gradient(_loss) + 1e-8) for _loss in metrics.loss.values()])
+                loss = tf.add_n(
+                    [
+                        _loss / (tf.stop_gradient(_loss) + 1e-8)
+                        for _loss in metrics.loss.values()
+                    ]
+                )
             else:
                 loss = tf.add_n(list(metrics.loss.values()))
 
@@ -1130,6 +1140,7 @@ def train(
                 self.batch_in_size,
                 self.evaluate_on_num_examples,
                 self.evaluate_every_num_epochs,
+                output_file=self.training_log_file,
             )
 
             # rebuild the graph for prediction
@@ -1164,6 +1175,12 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
         checkpoint = os.path.join(model_dir, file_name + ".ckpt")
 
+        # plot training curves
+        plotter = Plotter()
+        plotter.plot_training_curves(self.training_log_file, model_dir)
+        # copy trainig log file
+        copyfile(self.training_log_file, os.path.join(model_dir, "training-log.tsv"))
+
         try:
             os.makedirs(os.path.dirname(checkpoint))
         except OSError as e:
diff --git a/rasa/utils/plotter.py b/rasa/utils/plotter.py
new file mode 100644
index 000000000000..28840052a27b
--- /dev/null
+++ b/rasa/utils/plotter.py
@@ -0,0 +1,113 @@
+from pathlib import Path
+from typing import Union, List, Text
+
+import numpy as np
+import csv
+
+
+# to enable %matplotlib inline if running in ipynb
+from IPython import get_ipython
+
+ipy = get_ipython()
+if ipy is not None:
+    ipy.run_line_magic("matplotlib", "inline")
+
+
+import matplotlib.pyplot as plt
+
+
+class Plotter(object):
+    """
+    Plots training parameters (loss, f-score, and accuracy) and training weights over time.
+    Input files are the output files 'loss.tsv' and 'weights.txt' from training either a sequence tagger or text
+    classification model.
+    """
+
+    @staticmethod
+    def _extract_evaluation_data(
+        file_name: Text, score: str = "loss", prefix: str = "i"
+    ) -> dict:
+        training_curves = {"train": [], "val": []}
+
+        with open(file_name, "r") as tsvin:
+            tsvin = csv.reader(tsvin, delimiter="\t")
+
+            # determine the column index of loss, f-score and accuracy for train, dev and test split
+            row = next(tsvin, None)
+
+            score = score.upper()
+
+            TRAIN_SCORE = (
+                row.index(f"{prefix.upper()}_{score.upper()}")
+                if f"{prefix.upper()}_{score.upper()}" in row
+                else None
+            )
+            VAL_SCORE = (
+                row.index(f"VAL_{prefix.upper()}_{score.upper()}")
+                if f"VAL_{prefix.upper()}_{score.upper()}" in row
+                else None
+            )
+
+            # then get all relevant values from the tsv
+            for row in tsvin:
+
+                if TRAIN_SCORE is not None:
+                    if row[TRAIN_SCORE] != "_":
+                        training_curves["train"].append(float(row[TRAIN_SCORE]))
+
+                if VAL_SCORE is not None:
+                    if VAL_SCORE < len(row) and row[VAL_SCORE] != "_":
+                        training_curves["val"].append(float(row[VAL_SCORE]))
+                    else:
+                        training_curves["val"].append(0.0)
+
+        return training_curves
+
+    def plot_training_curves(self, file_name: Union[Text], output_folder: Text):
+        if type(output_folder) is str:
+            output_folder = Path(output_folder)
+
+        metrics = {
+            "intent": {"scores": ["loss", "acc"], "prefix": "i"},
+            "entity": {"scores": ["loss", "f1"], "prefix": "e"},
+        }
+
+        for metric_name, metric_values in metrics.items():
+
+            fig = plt.figure(figsize=(15, 10))
+
+            prefix = metric_values["prefix"]
+            scores = metric_values["scores"]
+
+            output_path = output_folder / f"training_{metric_name}.png"
+
+            for i, score in enumerate(scores):
+                training_curves = self._extract_evaluation_data(
+                    file_name, score, prefix
+                )
+
+                plt.subplot(len(scores), 1, i + 1)
+                if training_curves["train"]:
+                    x = np.arange(0, len(training_curves["train"]))
+                    plt.plot(
+                        x,
+                        training_curves["train"],
+                        label=f"train {metric_name} {score}",
+                    )
+                if training_curves["val"]:
+                    x = np.arange(0, len(training_curves["val"]))
+                    plt.plot(
+                        x, training_curves["val"], label=f"val {metric_name} {score}"
+                    )
+
+                plt.legend(bbox_to_anchor=(1.04, 0), loc="lower left", borderaxespad=0)
+                plt.ylabel(f"{metric_name} {score}")
+                plt.xlabel("epochs")
+
+            # save plots
+            plt.tight_layout(pad=1.0)
+            plt.savefig(output_path, dpi=300)
+            print(
+                f"Loss and acc plots are saved in {output_path}"
+            )  # to let user know the path of the save plots
+            plt.close(fig)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 8209dea243b8..85a555d1a811 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -42,8 +42,8 @@
 
 # namedtuple for training metrics
 class TrainingMetrics(NamedTuple):
-    loss: Dict[Text, tf.Tensor]
-    score: Dict[Text, tf.Tensor]
+    loss: Dict[Text, Union[tf.Tensor, float]]
+    score: Dict[Text, Union[tf.Tensor, float]]
 
 
 def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]:
@@ -1196,6 +1196,7 @@ def train_tf_dataset(
     batch_size: Union[List[int], int],
     evaluate_on_num_examples: int,
     evaluate_every_num_epochs: int,
+    output_file: Optional[Text] = None,
 ) -> None:
     """Train tf graph"""
 
@@ -1258,6 +1259,8 @@ def train_tf_dataset(
 
         pbar.set_postfix(postfix_dict)
 
+        _write_training_metrics(output_file, ep, train_metrics, val_metrics)
+
     logger.info("Finished training.")
 
 
@@ -1271,6 +1274,38 @@ def _update_postfix_dict(
     return postfix_dict
 
 
+def _write_training_metrics(
+    output_file: Text,
+    epoch: int,
+    train_metrics: TrainingMetrics,
+    val_metrics: TrainingMetrics,
+):
+    if output_file:
+        import datetime
+
+        # output log file
+        with open(output_file, "a") as f:
+            # make headers on first epoch
+            if epoch == 0:
+                f.write(f"EPOCH\tTIMESTAMP")
+                [f.write(f"\t{key.upper()}") for key in train_metrics.loss.keys()]
+                [f.write(f"\t{key.upper()}") for key in train_metrics.score.keys()]
+                [f.write(f"\tVAL_{key.upper()}") for key in train_metrics.loss.keys()]
+                [f.write(f"\tVAL_{key.upper()}") for key in train_metrics.score.keys()]
+
+            f.write(f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}")
+            [f.write(f"\t{val:.3f}") for val in train_metrics.loss.values()]
+            [f.write(f"\t{val:.3f}") for val in train_metrics.score.values()]
+            [
+                f.write(f"\t{val:.3f}") if val else f.write("\t0.0")
+                for val in val_metrics.loss.values()
+            ]
+            [
+                f.write(f"\t{val:.3f}") if val else f.write("\t0.0")
+                for val in val_metrics.score.values()
+            ]
+
+
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 
diff --git a/requirements.txt b/requirements.txt
index c514abc2bff8..b471ab842263 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ redis==3.3.5
 pymongo[tls,srv]==3.8.0
 numpy==1.16.3
 scipy==1.2.1
-tensorflow==1.15.0
+#tensorflow==1.15.0
 absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
 # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
@@ -57,6 +57,10 @@ PyJWT==1.7.1
 # remove when tensorflow@1.15.x or a pre-release patch is released
 # https://github.com/tensorflow/tensorflow/issues/32319
 gast==0.2.2
+# for new featurizers
+tensorflow==1.14.0
+tensorflow_hub==0.6.0
+tensorflow_text[no-deps]==0.1.0
 torch
 torchvision
 transformers

From e74a9f9231de5954f92678a3f64a1c4faf33882f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 20 Nov 2019 11:00:42 +0100
Subject: [PATCH 040/633] update requiremtens

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b471ab842263..0e47a15034f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,7 +60,7 @@ gast==0.2.2
 # for new featurizers
 tensorflow==1.14.0
 tensorflow_hub==0.6.0
-tensorflow_text[no-deps]==0.1.0
+#tensorflow_text[no-deps]==0.1.0
 torch
 torchvision
 transformers
diff --git a/setup.py b/setup.py
index 9bc541ef48af..3841c7193955 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@
     "pymongo[tls,srv]~=3.8",
     "numpy~=1.16",
     "scipy~=1.2",
-    "tensorflow~=1.15.0",
+    # "tensorflow~=1.15.0",
     # absl is a tensorflow dependency, but produces double logging before 0.8
     # should be removed once tensorflow requires absl > 0.8 on its own
     "absl-py>=0.8.0",

From d1384a496756a06468981dae47fa511793348bb0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 20 Nov 2019 11:08:53 +0100
Subject: [PATCH 041/633] fix import

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index d9ea913d09b3..48f4bbce73d4 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -13,7 +13,7 @@
 from tf_metrics import f1
 
 import rasa.utils.io as io_utils
-from utils.plotter import Plotter
+from rasa.utils.plotter import Plotter
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token

From 3c5371ff914d494c31e71a297203b296d8a3fa97 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 20 Nov 2019 11:16:19 +0100
Subject: [PATCH 042/633] update requirements

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 0e47a15034f2..f376f36ee91c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,5 +67,7 @@ transformers
 # for hermit evaluation
 pandas
 progress
+# for plotter
+ipython
 # to calculate f1 score in new architecture
 git+https://github.com/guillaumegenthial/tf_metrics.git
\ No newline at end of file

From 819e50dc1f7a471933fad13162eacfb97de7dd74 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 21 Nov 2019 09:18:20 +0100
Subject: [PATCH 043/633] remove not needed imports

---
 .../dense_featurizer/convert_featurizer.py           | 10 +---------
 rasa/nlu/tokenizers/convert_tokenizer.py             | 12 +-----------
 rasa/nlu/tokenizers/pretrained_lm_tokenizer.py       | 12 +-----------
 3 files changed, 3 insertions(+), 31 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 0576081df3a6..247e86b875c3 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -1,24 +1,16 @@
 import logging
-import os
 import re
-import scipy.sparse
 from rasa.nlu.featurizers.featurzier import Featurizer
-from typing import Any, Dict, List, Optional, Text
-from rasa.nlu import utils
+from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
     MESSAGE_TEXT_ATTRIBUTE,
-    MESSAGE_TOKENS_NAMES,
-    MESSAGE_ATTRIBUTES,
-    MESSAGE_INTENT_ATTRIBUTE,
     MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
     SPACY_FEATURIZABLE_ATTRIBUTES,
 )
 import numpy as np
 import tensorflow as tf
-import tensorflow_text
 import tensorflow_hub as tfhub
 
 logger = logging.getLogger(__name__)
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 1cbeadd12f95..c79e8d3cdbd0 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -1,23 +1,13 @@
-import logging
-import os
-import re
-import scipy.sparse
-from typing import Any, Dict, List, Optional, Text
-from rasa.nlu import utils
+from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
-from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
     MESSAGE_TEXT_ATTRIBUTE,
     MESSAGE_TOKENS_NAMES,
-    MESSAGE_ATTRIBUTES,
-    MESSAGE_INTENT_ATTRIBUTE,
     SPACY_FEATURIZABLE_ATTRIBUTES,
 )
-import torch
 from transformers import *
-import numpy as np
 import tensorflow as tf
 import tensorflow_hub as tfhub
 
diff --git a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
index e1b687774836..d9439e98b266 100644
--- a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
@@ -1,23 +1,13 @@
-import logging
-import os
-import re
-import scipy.sparse
-from typing import Any, Dict, List, Optional, Text
-from rasa.nlu import utils
+from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
-from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
     MESSAGE_TEXT_ATTRIBUTE,
     MESSAGE_TOKENS_NAMES,
-    MESSAGE_ATTRIBUTES,
-    MESSAGE_INTENT_ATTRIBUTE,
     SPACY_FEATURIZABLE_ATTRIBUTES,
 )
-import torch
 from transformers import *
-import numpy as np
 
 logger = logging.getLogger(__name__)
 

From 4cf2e1304c5ec7fe525714102a430643d2e94fde Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 21 Nov 2019 10:02:33 +0100
Subject: [PATCH 044/633] remove ' & ' from text for convert tokeinzer.

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 3 +++
 rasa/nlu/tokenizers/whitespace_tokenizer.py                 | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 247e86b875c3..6d2d5a8339a0 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -114,6 +114,9 @@ def _clean_text(text):
         # remove multiple occurences of ' '
         cleaned_text = re.sub(" +", " ", cleaned_text)
 
+        # remove " & "
+        cleaned_text = re.sub("\s&\s", " ", cleaned_text)
+
         if not cleaned_text.strip():
             cleaned_text = text
 
diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py
index fa029b0b43f2..29fa97832e48 100644
--- a/rasa/nlu/tokenizers/whitespace_tokenizer.py
+++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -78,6 +78,9 @@ def tokenize(
                 " ",
                 text,
             ).split()
+            # Fix for convert tokenizer, remove any single '&'
+            words = [w for w in words if w != "&"]
+
             # if we removed everything like smiles `:)`, use the whole text as 1 token
             if not words:
                 words = [text]

From 078a9542e2bf0a5308b8858feb43a00c6f03bbae Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 21 Nov 2019 10:14:51 +0100
Subject: [PATCH 045/633] import tensorflow_text

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 3 +++
 rasa/nlu/tokenizers/convert_tokenizer.py                    | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 6d2d5a8339a0..43b418b1b570 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -13,6 +13,9 @@
 import tensorflow as tf
 import tensorflow_hub as tfhub
 
+# needed to load convert model
+import tensorflow_text
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index c79e8d3cdbd0..4bd16819fd52 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -11,6 +11,9 @@
 import tensorflow as tf
 import tensorflow_hub as tfhub
 
+# needed to load convert model
+import tensorflow_text
+
 logger = logging.getLogger(__name__)
 
 

From 1cef544a1dfe0410848188db6287da8ae2f81dfa Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 21 Nov 2019 11:10:14 +0100
Subject: [PATCH 046/633] add masked_lm_loss

---
 .../embedding_intent_classifier.py            | 102 +++++++++++++++---
 rasa/utils/train_utils.py                     |   4 +-
 2 files changed, 88 insertions(+), 18 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 413de8e0fd30..cfca2b307f3f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -143,6 +143,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "intent_classification": True,
         # if true named entity recognition is trained and entities predicted
         "named_entity_recognition": True,
+        "masked_lm_loss": False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -240,6 +241,7 @@ def _load_params(self) -> None:
         self.named_entity_recognition = self.component_config[
             "named_entity_recognition"
         ]
+        self.masked_lm_loss = self.component_config["masked_lm_loss"]
 
     # package safety checks
     @classmethod
@@ -591,19 +593,21 @@ def _create_session_data(
         session_data = {}
         self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense])
         self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense])
+        if label_attribute and (
+            "intent_features" not in session_data or not session_data["intent_features"]
+        ):
+            # no label features are present, get default features from _label_data
+            session_data["intent_features"] = self.use_default_label_features(label_ids)
+
         # explicitly add last dimension to label_ids
         # to track correctly dynamic sequences
         self._add_to_session_data(
             session_data, "intent_ids", [np.expand_dims(label_ids, -1)]
         )
         self._add_to_session_data(session_data, "tag_ids", [tag_ids])
-        self._add_mask_to_session_data(session_data, "text_mask", "text_features")
 
-        if label_attribute and (
-            "intent_features" not in session_data or not session_data["intent_features"]
-        ):
-            # no label features are present, get default features from _label_data
-            session_data["intent_features"] = self.use_default_label_features(label_ids)
+        self._add_mask_to_session_data(session_data, "text_mask", "text_features")
+        self._add_mask_to_session_data(session_data, "intent_mask", "intent_features")
 
         return session_data
 
@@ -630,7 +634,10 @@ def _create_tf_embed_fnn(
         )
 
     def combine_sparse_dense_features(
-        self, features: List[Union["tf.Tensor", "tf.SparseTensor"]], name: Text
+        self,
+        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+        mask: "tf.Tensor",
+        name: Text,
     ) -> "tf.Tensor":
 
         dense_features = []
@@ -654,7 +661,7 @@ def combine_sparse_dense_features(
         #     self._in_layer_norm[name] = tf.keras.layers.LayerNormalization(name=name)
         # return self._in_layer_norm[name](tf.concat(dense_features, axis=-1))
 
-        return tf.concat(dense_features, axis=-1)
+        return tf.concat(dense_features, axis=-1) * mask
 
     def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
         """Create sequence level embedding and mask."""
@@ -698,35 +705,94 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
         batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data)
         label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data)
 
-        a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
         mask = batch_data["text_mask"][0]
+        a = self.combine_sparse_dense_features(
+            batch_data["text_features"], mask, "text"
+        )
+
+        if self.masked_lm_loss:
+            a_random = (
+                tf.random.uniform(
+                    tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype
+                )
+                * mask
+            )
+            # a_shuffle = tf.random.shuffle(a)
+
+            other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
+            other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
+            # a_other = tf.stop_gradient(tf.where(other_prob < 0.80, a_random, tf.where(other_prob < 0.90, a_shuffle, a)))
+            a_other = tf.where(other_prob < 0.80, a_random, a)
+
+            lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
+            lm_mask = tf.greater_equal(lm_mask_prob, 0.85)
+            a_pre = tf.where(tf.tile(lm_mask, (1, 1, a.shape[-1])), a_other, a)
+        else:
+            a_pre = a
+            lm_mask = None
 
         # transformer
-        a = self._create_tf_sequence(a, mask)
+        a_transformed = self._create_tf_sequence(a_pre, mask)
 
         metrics = TrainingMetrics(loss={}, score={})
 
+        if self.masked_lm_loss:
+            loss, acc = self._train_mask_graph(a_transformed, a, lm_mask)
+            metrics.loss["m_loss"] = loss
+            metrics.score["m_acc"] = acc
+
         if self.intent_classification:
             b = self.combine_sparse_dense_features(
-                batch_data["intent_features"], "intent"
+                batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
             )
             all_bs = self.combine_sparse_dense_features(
-                label_data["intent_features"], "intent"
+                label_data["intent_features"], label_data["intent_mask"][0], "intent"
             )
 
-            loss, acc = self._train_intent_graph(a, b, all_bs, mask)
+            loss, acc = self._train_intent_graph(a_transformed, b, all_bs, mask)
             metrics.loss["i_loss"] = loss
             metrics.score["i_acc"] = acc
 
         if self.named_entity_recognition:
-            c = self.combine_sparse_dense_features(batch_data["tag_ids"], "tag")
+            c = self.combine_sparse_dense_features(batch_data["tag_ids"], mask, "tag")
 
-            loss, f1_score = self._train_entity_graph(a, c, mask)
+            loss, f1_score = self._train_entity_graph(a_transformed, c, mask)
             metrics.loss["e_loss"] = loss
             metrics.score["e_f1"] = f1_score
 
         return metrics
 
+    def _train_mask_graph(self, a_transformed, a, lm_mask):
+
+        lm_mask = tf.squeeze(lm_mask, -1)
+        a_t_masked = tf.boolean_mask(a_transformed, lm_mask)
+        a_masked = tf.boolean_mask(a, lm_mask)
+
+        a_t_masked_embed = train_utils.create_tf_embed(
+            a_t_masked, self.embed_dim, self.C2, "a_transformed", self.similarity_type
+        )
+
+        a_embed = train_utils.create_tf_embed(
+            a, self.embed_dim, self.C2, "a", self.similarity_type
+        )
+        a_embed_masked = tf.boolean_mask(a_embed, lm_mask)
+
+        return train_utils.calculate_loss_acc(
+            a_t_masked_embed,
+            a_embed_masked,
+            a_masked,
+            a_embed,
+            a,
+            self.num_neg,
+            None,
+            self.loss_type,
+            self.mu_pos,
+            self.mu_neg,
+            self.use_max_sim_neg,
+            self.C_emb,
+            self.scale_loss,
+        )
+
     def _train_entity_graph(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
@@ -833,15 +899,17 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType"):
             self.batch_in, session_data
         )
 
-        a = self.combine_sparse_dense_features(batch_data["text_features"], "text")
         mask = batch_data["text_mask"][0]
+        a = self.combine_sparse_dense_features(
+            batch_data["text_features"], mask, "text"
+        )
 
         # transformer
         a = self._create_tf_sequence(a, mask)
 
         if self.intent_classification:
             b = self.combine_sparse_dense_features(
-                batch_data["intent_features"], "intent"
+                batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
             )
             self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 85a555d1a811..b75afa972175 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -854,7 +854,9 @@ def sample_negatives(
         _tf_make_flat(a_embed), _tf_make_flat(b_raw), b_raw, num_neg
     )
 
-    neg_bot_embed, bot_bad_negs = _tf_get_negs(all_b_embed, all_b_raw, b_raw, num_neg)
+    neg_bot_embed, bot_bad_negs = _tf_get_negs(
+        _tf_make_flat(all_b_embed), _tf_make_flat(all_b_raw), b_raw, num_neg
+    )
     return (
         tf.expand_dims(a_embed, -2),
         tf.expand_dims(b_embed, -2),

From a46a3c4a79e061038ab3d1b3222302e2df9833a4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 21 Nov 2019 14:36:01 +0100
Subject: [PATCH 047/633] update mask lm loss

---
 .../embedding_intent_classifier.py            | 64 +++++++++++++------
 1 file changed, 44 insertions(+), 20 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index cfca2b307f3f..9761522bfc12 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -663,7 +663,42 @@ def combine_sparse_dense_features(
 
         return tf.concat(dense_features, axis=-1) * mask
 
-    def _create_tf_sequence(self, a_in, mask) -> "tf.Tensor":
+    @staticmethod
+    def _mask_input(
+        a: "tf.Tensor", mask: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        """Randomly mask input sequences."""
+
+        # do not mask cls token
+        pad_mask_up_to_last = tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        mask_up_to_last = 1 - pad_mask_up_to_last
+
+        a_random_pad = (
+            tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
+            * pad_mask_up_to_last
+        )
+        a_shuffle = tf.stop_gradient(
+            tf.random.shuffle(a * mask_up_to_last + a_random_pad)
+        )
+
+        mask_vector = tf.get_variable("mask_vector", (1, 1, a.shape[-1]), a.dtype)
+        a_mask = tf.tile(mask_vector, (tf.shape(a)[0], tf.shape(a)[1], 1))
+
+        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
+        other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
+        a_other = tf.where(
+            other_prob < 0.70, a_mask, tf.where(other_prob < 0.90, a_shuffle, a)
+        )
+
+        lm_mask_prob = (
+            tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask_up_to_last
+        )
+        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
+        a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
+
+        return a_pre, lm_mask_bool
+
+    def _create_tf_sequence(self, a_in: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor":
         """Create sequence level embedding and mask."""
 
         a_in = train_utils.create_tf_fnn(
@@ -711,25 +746,9 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
         )
 
         if self.masked_lm_loss:
-            a_random = (
-                tf.random.uniform(
-                    tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype
-                )
-                * mask
-            )
-            # a_shuffle = tf.random.shuffle(a)
-
-            other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
-            other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
-            # a_other = tf.stop_gradient(tf.where(other_prob < 0.80, a_random, tf.where(other_prob < 0.90, a_shuffle, a)))
-            a_other = tf.where(other_prob < 0.80, a_random, a)
-
-            lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-            lm_mask = tf.greater_equal(lm_mask_prob, 0.85)
-            a_pre = tf.where(tf.tile(lm_mask, (1, 1, a.shape[-1])), a_other, a)
+            a_pre, lm_mask_bool = self._mask_input(a, mask)
         else:
-            a_pre = a
-            lm_mask = None
+            a_pre, lm_mask_bool = (a, None)
 
         # transformer
         a_transformed = self._create_tf_sequence(a_pre, mask)
@@ -737,7 +756,12 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
         metrics = TrainingMetrics(loss={}, score={})
 
         if self.masked_lm_loss:
-            loss, acc = self._train_mask_graph(a_transformed, a, lm_mask)
+            loss, acc = self._train_mask_graph(a_transformed, a, lm_mask_bool)
+            loss, acc = tf.cond(
+                tf.reduce_any(lm_mask_bool),
+                lambda: (loss, acc),
+                lambda: (tf.constant(0, a.dtype), tf.constant(0, a.dtype)),
+            )
             metrics.loss["m_loss"] = loss
             metrics.score["m_acc"] = acc
 

From 49c8c766c9559e2d8a8d28d72fcf38e4f697c886 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 21 Nov 2019 16:02:32 +0100
Subject: [PATCH 048/633] make sure there is at least one element in the lm
 mask

---
 .../embedding_intent_classifier.py            | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9761522bfc12..55f3c0b48aeb 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -687,7 +687,7 @@ def _mask_input(
         other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
         other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
         a_other = tf.where(
-            other_prob < 0.70, a_mask, tf.where(other_prob < 0.90, a_shuffle, a)
+            other_prob < 0.70, a_mask, tf.where(other_prob < 0.80, a_shuffle, a)
         )
 
         lm_mask_prob = (
@@ -757,11 +757,7 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
 
         if self.masked_lm_loss:
             loss, acc = self._train_mask_graph(a_transformed, a, lm_mask_bool)
-            loss, acc = tf.cond(
-                tf.reduce_any(lm_mask_bool),
-                lambda: (loss, acc),
-                lambda: (tf.constant(0, a.dtype), tf.constant(0, a.dtype)),
-            )
+
             metrics.loss["m_loss"] = loss
             metrics.score["m_acc"] = acc
 
@@ -786,11 +782,18 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
 
         return metrics
 
-    def _train_mask_graph(self, a_transformed, a, lm_mask):
+    def _train_mask_graph(self, a_transformed, a, lm_mask_bool):
+
+        # make sure there is at least one element in the mask
+        lm_mask_bool = tf.cond(
+            tf.reduce_any(lm_mask_bool),
+            lambda: lm_mask_bool,
+            lambda: tf.scatter_nd([[0, 0, 0]], [True], tf.shape(lm_mask_bool)),
+        )
 
-        lm_mask = tf.squeeze(lm_mask, -1)
-        a_t_masked = tf.boolean_mask(a_transformed, lm_mask)
-        a_masked = tf.boolean_mask(a, lm_mask)
+        lm_mask_bool = tf.squeeze(lm_mask_bool, -1)
+        a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
+        a_masked = tf.boolean_mask(a, lm_mask_bool)
 
         a_t_masked_embed = train_utils.create_tf_embed(
             a_t_masked, self.embed_dim, self.C2, "a_transformed", self.similarity_type
@@ -799,7 +802,7 @@ def _train_mask_graph(self, a_transformed, a, lm_mask):
         a_embed = train_utils.create_tf_embed(
             a, self.embed_dim, self.C2, "a", self.similarity_type
         )
-        a_embed_masked = tf.boolean_mask(a_embed, lm_mask)
+        a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
 
         return train_utils.calculate_loss_acc(
             a_t_masked_embed,

From 8b1c923ec754d550fb4553137a4921d16728d846 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 22 Nov 2019 14:49:40 +0100
Subject: [PATCH 049/633] plot also mask training curves

---
 rasa/utils/plotter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/utils/plotter.py b/rasa/utils/plotter.py
index 28840052a27b..515ba7ec38c6 100644
--- a/rasa/utils/plotter.py
+++ b/rasa/utils/plotter.py
@@ -70,6 +70,7 @@ def plot_training_curves(self, file_name: Union[Text], output_folder: Text):
         metrics = {
             "intent": {"scores": ["loss", "acc"], "prefix": "i"},
             "entity": {"scores": ["loss", "f1"], "prefix": "e"},
+            "mask": {"scores": ["loss", "acc"], "prefix": "m"},
         }
 
         for metric_name, metric_values in metrics.items():

From 42f23ba5ae0382a044623893d75c63aeb08c2ce4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 22 Nov 2019 15:05:38 +0100
Subject: [PATCH 050/633] make learning rate configurable

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 55f3c0b48aeb..def7a0454c27 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -102,6 +102,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "random_seed": None,
         # optimizer
         "optimizer": "Adam",  # can be either 'Adam' (default) or 'Nadam'
+        "learning_rate": 0.001,
         "normalize_loss": False,
         # embedding parameters
         # default dense dimension used if no dense features are present
@@ -188,6 +189,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
 
         self.optimizer = config["optimizer"]
         self.normalize_loss = config["normalize_loss"]
+        self.learning_rate = config["learning_rate"]
         self.epochs = config["epochs"]
 
         self.random_seed = self.component_config["random_seed"]
@@ -1217,9 +1219,13 @@ def train(
 
             # define which optimizer to use
             if self.optimizer.lower() == "nadam":
-                self._train_op = tf.contrib.opt.NadamOptimizer().minimize(loss)
+                self._train_op = tf.contrib.opt.NadamOptimizer(
+                    learning_rate=self.learning_rate
+                ).minimize(loss)
             else:
-                self._train_op = tf.train.AdamOptimizer().minimize(loss)
+                self._train_op = tf.train.AdamOptimizer(
+                    learning_rate=self.learning_rate
+                ).minimize(loss)
 
             # train tensorflow graph
             self.session = tf.Session(config=self._tf_config)

From 34e76e0d1de8f8686c350de44c86f3505e8b1220 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 22 Nov 2019 16:59:52 +0100
Subject: [PATCH 051/633] add sparse input dropout

---
 .../embedding_intent_classifier.py            | 26 ++++++++++++++++---
 rasa/utils/train_utils.py                     |  8 +++---
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 55f3c0b48aeb..f3fb4035cf96 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -144,6 +144,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true named entity recognition is trained and entities predicted
         "named_entity_recognition": True,
         "masked_lm_loss": False,
+        "sparse_input_dropout": False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -242,6 +243,7 @@ def _load_params(self) -> None:
             "named_entity_recognition"
         ]
         self.masked_lm_loss = self.component_config["masked_lm_loss"]
+        self.sparse_input_dropout = self.component_config["sparse_input_dropout"]
 
     # package safety checks
     @classmethod
@@ -638,6 +640,7 @@ def combine_sparse_dense_features(
         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
         mask: "tf.Tensor",
         name: Text,
+        sparse_dropout: bool = False,
     ) -> "tf.Tensor":
 
         dense_features = []
@@ -651,8 +654,19 @@ def combine_sparse_dense_features(
 
         for f in features:
             if isinstance(f, tf.SparseTensor):
+                if sparse_dropout:
+                    to_retain_prob = tf.random.uniform(
+                        tf.shape(f.values), 0, 1, f.values.dtype
+                    )
+                    to_retain = tf.greater_equal(to_retain_prob, self.droprate)
+                    _f = tf.sparse.retain(f, to_retain)
+                    _f = tf.cond(self._is_training, lambda: _f, lambda: f)
+                else:
+                    _f = f
                 dense_features.append(
-                    train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2)
+                    train_utils.tf_dense_layer_for_sparse(
+                        f, dense_dim, name, self.C2, input_dim=int(f.shape[-1])
+                    )
                 )
             else:
                 dense_features.append(f)
@@ -663,9 +677,8 @@ def combine_sparse_dense_features(
 
         return tf.concat(dense_features, axis=-1) * mask
 
-    @staticmethod
     def _mask_input(
-        a: "tf.Tensor", mask: "tf.Tensor"
+        self, a: "tf.Tensor", mask: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Randomly mask input sequences."""
 
@@ -696,6 +709,8 @@ def _mask_input(
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
         a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
 
+        a_pre = tf.cond(self._is_training, lambda: a_pre, lambda: a)
+
         return a_pre, lm_mask_bool
 
     def _create_tf_sequence(self, a_in: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor":
@@ -742,7 +757,10 @@ def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetric
 
         mask = batch_data["text_mask"][0]
         a = self.combine_sparse_dense_features(
-            batch_data["text_features"], mask, "text"
+            batch_data["text_features"],
+            mask,
+            "text",
+            sparse_dropout=self.sparse_input_dropout,
         )
 
         if self.masked_lm_loss:
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index b75afa972175..11d45c515d19 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -558,6 +558,7 @@ def tf_dense_layer_for_sparse(
     C2: float,
     activation: Optional[Callable] = tf.nn.relu,
     use_bias: bool = True,
+    input_dim: Optional[int] = None,
 ) -> tf.Tensor:
     """Dense layer for sparse input tensor"""
 
@@ -566,18 +567,17 @@ def tf_dense_layer_for_sparse(
 
     with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE):
         kernel_regularizer = tf.contrib.layers.l2_regularizer(C2)
+        input_dim = input_dim or int(inputs.shape[-1])
         kernel = tf.get_variable(
             "kernel",
-            shape=[inputs.shape[-1], units],
+            shape=[input_dim, units],
             dtype=inputs.dtype,
             regularizer=kernel_regularizer,
         )
         bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype)
 
         # outputs will be 2D
-        outputs = tf.sparse.matmul(
-            tf.sparse.reshape(inputs, [-1, int(inputs.shape[-1])]), kernel
-        )
+        outputs = tf.sparse.matmul(tf.sparse.reshape(inputs, [-1, input_dim]), kernel)
 
         if len(inputs.shape) == 3:
             # reshape back

From 8406bebe2d4f63f7934712d6ef3d3c8eda2b510e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 22 Nov 2019 18:40:56 +0100
Subject: [PATCH 052/633] fix sparse input dropout

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 481563f6bb93..32d45c850bd2 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -665,9 +665,10 @@ def combine_sparse_dense_features(
                     _f = tf.cond(self._is_training, lambda: _f, lambda: f)
                 else:
                     _f = f
+
                 dense_features.append(
                     train_utils.tf_dense_layer_for_sparse(
-                        f, dense_dim, name, self.C2, input_dim=int(f.shape[-1])
+                        _f, dense_dim, name, self.C2, input_dim=int(f.shape[-1])
                     )
                 )
             else:

From ca7d618a210f4f0a08ac9a1abc424d81a450906f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 23 Nov 2019 12:48:44 +0100
Subject: [PATCH 053/633] only down scale loss

---
 rasa/utils/train_utils.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 11d45c515d19..265d62bc63c9 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1004,12 +1004,8 @@ def tf_loss_softmax(
 
     if scale_loss:
         # mask loss by prediction confidence
-        pred = tf.nn.softmax(logits)
-        if len(pred.shape) == 3:
-            pos_pred = pred[:, :, 0]
-        else:  # len(pred.shape) == 2
-            pos_pred = pred[:, 0]
-        mask *= tf.pow((1 - pos_pred) / 0.5, 4)
+        pos_pred = tf.nn.softmax(logits)[..., 0]
+        mask *= tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
 
     loss = tf.losses.softmax_cross_entropy(labels, logits, mask)
     # add regularization losses

From c97871234f502253542ba1c77cc255cd4464ce7e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 23 Nov 2019 12:49:00 +0100
Subject: [PATCH 054/633] change defaults to 256 and 2

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 32d45c850bd2..2abba0e80dd5 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -81,9 +81,9 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # Whether to share the hidden layer weights between input words and labels
         "share_hidden_layers": False,
         # number of units in transformer
-        "transformer_size": 128,
+        "transformer_size": 256,
         # number of transformer layers
-        "num_transformer_layers": 1,
+        "num_transformer_layers": 2,
         # number of attention heads in transformer
         "num_heads": 4,
         # type of positional encoding in transformer

From 2efb02b4c21b4ca9285cd8437e801e6f8422ddf6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 25 Nov 2019 16:21:33 +0100
Subject: [PATCH 055/633] clean up after merge

---
 .../embedding_intent_classifier.py            |   1 -
 rasa/nlu/featurizers/convert_featurizer.py    | 117 ------------
 .../dense_featurizer/convert_featurizer.py    | 177 +++++-------------
 rasa/nlu/registry.py                          |   8 +-
 4 files changed, 50 insertions(+), 253 deletions(-)
 delete mode 100644 rasa/nlu/featurizers/convert_featurizer.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 2abba0e80dd5..ed1d8cb837c7 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1009,7 +1009,6 @@ def preprocess_train_data(self, training_data: "TrainingData"):
 
         Performs sanity checks on training data, extracts encodings for labels.
         """
-
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=MESSAGE_INTENT_ATTRIBUTE
         )
diff --git a/rasa/nlu/featurizers/convert_featurizer.py b/rasa/nlu/featurizers/convert_featurizer.py
deleted file mode 100644
index 614feeef3b1a..000000000000
--- a/rasa/nlu/featurizers/convert_featurizer.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import logging
-from rasa.nlu.featurizers import Featurizer
-from typing import Any, Dict, List, Optional, Text, Tuple
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    MESSAGE_TEXT_ATTRIBUTE,
-    MESSAGE_VECTOR_FEATURE_NAMES,
-    SPACY_FEATURIZABLE_ATTRIBUTES,
-)
-import numpy as np
-import tensorflow as tf
-
-logger = logging.getLogger(__name__)
-
-
-class ConveRTFeaturizer(Featurizer):
-
-    provides = [
-        MESSAGE_VECTOR_FEATURE_NAMES[attribute]
-        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    def _load_model(self) -> None:
-
-        import tensorflow_text
-        import tensorflow_hub as tfhub
-
-        self.graph = tf.Graph()
-        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-
-        with self.graph.as_default():
-            self.session = tf.Session()
-            self.module = tfhub.Module(model_url)
-
-            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            self.encoding_tensor = self.module(self.text_placeholder)
-            self.session.run(tf.tables_initializer())
-            self.session.run(tf.global_variables_initializer())
-
-    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
-
-        super(ConveRTFeaturizer, self).__init__(component_config)
-
-        self._load_model()
-
-    @classmethod
-    def required_packages(cls) -> List[Text]:
-        return ["tensorflow_text", "tensorflow_hub"]
-
-    def _compute_features(
-        self, batch_examples: List[Message], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
-    ) -> np.ndarray:
-
-        # Get text for attribute of each example
-        batch_attribute_text = [ex.get(attribute) for ex in batch_examples]
-
-        batch_features = self._run_model_on_text(batch_attribute_text)
-
-        return batch_features
-
-    def _run_model_on_text(self, batch: List[Text]) -> np.ndarray:
-
-        return self.session.run(
-            self.encoding_tensor, feed_dict={self.text_placeholder: batch}
-        )
-
-    def train(
-        self,
-        training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
-        **kwargs: Any,
-    ) -> None:
-
-        batch_size = 64
-
-        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
-
-            non_empty_examples = list(
-                filter(lambda x: x.get(attribute), training_data.training_examples)
-            )
-
-            batch_start_index = 0
-
-            while batch_start_index < len(non_empty_examples):
-
-                batch_end_index = min(
-                    batch_start_index + batch_size, len(non_empty_examples)
-                )
-
-                # Collect batch examples
-                batch_examples = non_empty_examples[batch_start_index:batch_end_index]
-
-                batch_features = self._compute_features(batch_examples, attribute)
-
-                for index, ex in enumerate(batch_examples):
-
-                    ex.set(
-                        MESSAGE_VECTOR_FEATURE_NAMES[attribute],
-                        self._combine_with_existing_features(
-                            ex,
-                            batch_features[index],
-                            MESSAGE_VECTOR_FEATURE_NAMES[attribute],
-                        ),
-                    )
-
-                batch_start_index += batch_size
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-
-        feats = self._compute_features([message])[0]
-        message.set(
-            MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
-            self._combine_with_existing_features(
-                message, feats, MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]
-            ),
-        )
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 43b418b1b570..0dbcfc179ebd 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -1,7 +1,7 @@
 import logging
-import re
+from typing import Any, Dict, List, Optional, Text, Tuple
+
 from rasa.nlu.featurizers.featurzier import Featurizer
-from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
@@ -11,28 +11,21 @@
 )
 import numpy as np
 import tensorflow as tf
-import tensorflow_hub as tfhub
-
-# needed to load convert model
-import tensorflow_text
 
 logger = logging.getLogger(__name__)
 
 
-class ConvertFeaturizer(Featurizer):
+class ConveRTFeaturizer(Featurizer):
+
     provides = [
         MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
         for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
     ]
 
-    defaults = {
-        # model key identified by HF Transformers
-        "return_sequence": True
-    }
+    def _load_model(self) -> None:
 
-    def _load_model(self):
-
-        self.return_sequence = self.component_config["return_sequence"]
+        import tensorflow_text
+        import tensorflow_hub as tfhub
 
         self.graph = tf.Graph()
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
@@ -42,23 +35,37 @@ def _load_model(self):
             self.module = tfhub.Module(model_url)
 
             self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            if self.return_sequence:
-                self.sequence_encoding_tensor = self.module(
-                    self.text_placeholder, signature="encode_sequence", as_dict=True
-                )
-                self.tokenized = self.module(
-                    self.text_placeholder, signature="tokenize"
-                )
-            self.sentence_encoding_tensor = self.module(self.text_placeholder)
+            self.encoding_tensor = self.module(self.text_placeholder)
             self.session.run(tf.tables_initializer())
             self.session.run(tf.global_variables_initializer())
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
-        super(ConvertFeaturizer, self).__init__(component_config)
+        super(ConveRTFeaturizer, self).__init__(component_config)
 
         self._load_model()
 
+    @classmethod
+    def required_packages(cls) -> List[Text]:
+        return ["tensorflow_text", "tensorflow_hub"]
+
+    def _compute_features(
+        self, batch_examples: List[Message], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
+    ) -> np.ndarray:
+
+        # Get text for attribute of each example
+        batch_attribute_text = [ex.get(attribute) for ex in batch_examples]
+
+        batch_features = self._run_model_on_text(batch_attribute_text)
+
+        return batch_features
+
+    def _run_model_on_text(self, batch: List[Text]) -> np.ndarray:
+
+        return self.session.run(
+            self.encoding_tensor, feed_dict={self.text_placeholder: batch}
+        )
+
     def train(
         self,
         training_data: TrainingData,
@@ -66,132 +73,43 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        bs = 64
+        batch_size = 64
 
-        for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
+        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
 
-            start_index = 0
+            non_empty_examples = list(
+                filter(lambda x: x.get(attribute), training_data.training_examples)
+            )
+
+            batch_start_index = 0
 
-            while start_index < len(training_data.intent_examples):
+            while batch_start_index < len(non_empty_examples):
 
-                end_index = min(start_index + bs, len(training_data.intent_examples))
-                batch_examples = training_data.intent_examples[start_index:end_index]
+                batch_end_index = min(
+                    batch_start_index + batch_size, len(non_empty_examples)
+                )
 
-                batch_text = [
-                    self._clean_text(ex.get(attribute)) for ex in batch_examples
-                ]
+                # Collect batch examples
+                batch_examples = non_empty_examples[batch_start_index:batch_end_index]
 
-                batch_feats = self._compute_features(batch_text)
+                batch_features = self._compute_features(batch_examples, attribute)
 
                 for index, ex in enumerate(batch_examples):
 
-                    # print(batch_text[index], batch_feats[index].shape)
                     ex.set(
                         MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
                         self._combine_with_existing_dense_features(
                             ex,
-                            batch_feats[index],
+                            batch_features[index],
                             MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
                         ),
                     )
 
-                start_index += bs
-
-    @staticmethod
-    def _clean_text(text):
-
-        cleaned_text = re.sub(
-            # there is a space or an end of a string after it
-            r"[^\w#@&]+(?=\s|$)|"
-            # there is a space or beginning of a string before it
-            # not followed by a number
-            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
-            # not in between numbers and not . or @ or & or - or #
-            # e.g. 10'000.00 or blabla@gmail.com
-            # and not url characters
-            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
-            " ",
-            text,
-        )
-
-        # remove multiple occurences of ' '
-        cleaned_text = re.sub(" +", " ", cleaned_text)
-
-        # remove " & "
-        cleaned_text = re.sub("\s&\s", " ", cleaned_text)
-
-        if not cleaned_text.strip():
-            cleaned_text = text
-
-        return cleaned_text.strip()
-
-    def _tokenize(self, sentence):
-
-        return self.session.run(
-            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
-        )
-
-    def _compute_features(self, batch_examples):
-
-        sentence_encodings = self.session.run(
-            self.sentence_encoding_tensor,
-            feed_dict={self.text_placeholder: batch_examples},
-        )
-
-        # convert them to a sequence
-        sentence_encodings = np.reshape(
-            sentence_encodings, (len(batch_examples), 1, -1)
-        )
-
-        if self.return_sequence:
-
-            final_embeddings = []
-
-            batch_tokenized = [self._tokenize(sentence) for sentence in batch_examples]
-
-            actual_lens = [token_vector.shape[1] for token_vector in batch_tokenized]
-
-            sequence_encodings = self.session.run(
-                self.sequence_encoding_tensor,
-                feed_dict={self.text_placeholder: batch_examples},
-            )["sequence_encoding"]
-
-            for index in range(len(batch_examples)):
-
-                seq_len = actual_lens[index]
-                seq_enc = sequence_encodings[index][:seq_len]
-                sent_enc = sentence_encodings[index]
-
-                # tile seq enc to duplicate
-                seq_enc = np.tile(seq_enc, (1, 2))
-
-                # add sent_enc to the end
-                seq_enc = np.concatenate([seq_enc, sent_enc], axis=0)
-
-                final_embeddings.append(seq_enc)
-
-            return final_embeddings
-
-        return sentence_encodings
-
-    def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
-
-        message_attribute_text = example.get(attribute)
-        if message_attribute_text:
-            # Encode text
-            features = self.module([message_attribute_text])[0]
-            features = self._combine_with_existing_features(
-                example, features, MESSAGE_VECTOR_FEATURE_NAMES[attribute]
-            )
-            # print(features.shape)
-            example.set(MESSAGE_VECTOR_FEATURE_NAMES[attribute], features)
+                batch_start_index += batch_size
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        feats = self._compute_features(
-            [self._clean_text(message.get(MESSAGE_TEXT_ATTRIBUTE))]
-        )[0]
-
+        feats = self._compute_features([message])[0]
         message.set(
             MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
             self._combine_with_existing_dense_features(
@@ -200,4 +118,3 @@ def process(self, message: Message, **kwargs: Any) -> None:
                 MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
             ),
         )
-        # self._set_lm_features(message)
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 8bf0c1f1dfa8..76801af1996a 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -9,9 +9,8 @@
 import typing
 from typing import Any, Dict, List, Optional, Text, Type
 
-from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
-    PreTrainedLMFeaturizer,
-)
+from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer
+from rasa.nlu.tokenizers.convert_tokenizer import ConvertTokenizer
 from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
@@ -32,14 +31,12 @@
 from rasa.nlu.featurizers.dense_featurizer.pretrained_lm_featurizer import (
     PreTrainedLMFeaturizer,
 )
-from rasa.nlu.featurizers.convert_featurizer import ConveRTFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.tokenizers.pretrained_lm_tokenizer import PreTrainedLMTokenizer
-from rasa.nlu.tokenizers.convert_tokenizer import ConvertTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.utils.common import class_from_module_path
@@ -112,6 +109,7 @@
     "intent_classifier_mitie": "MitieIntentClassifier",
     "intent_classifier_keyword": "KeywordIntentClassifier",
     "intent_classifier_tensorflow_embedding": "EmbeddingIntentClassifier",
+    "ConvertFeaturizer": "ConveRTFeaturizer",
 }
 
 # To simplify usage, there are a couple of model templates, that already add

From 9928cab66b7d400f0ebd64c45a6393e2a169e3b4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 25 Nov 2019 17:02:58 +0100
Subject: [PATCH 056/633] revert convert featurizer

---
 .../dense_featurizer/convert_featurizer.py    | 177 +++++++++++++-----
 rasa/nlu/registry.py                          |   5 +-
 2 files changed, 132 insertions(+), 50 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 0dbcfc179ebd..b268aefc8d2e 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -1,7 +1,7 @@
 import logging
-from typing import Any, Dict, List, Optional, Text, Tuple
-
+import re
 from rasa.nlu.featurizers.featurzier import Featurizer
+from typing import Any, Dict, Optional, Text
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
@@ -11,21 +11,28 @@
 )
 import numpy as np
 import tensorflow as tf
+import tensorflow_hub as tfhub
 
-logger = logging.getLogger(__name__)
+# needed to load convert model
+import tensorflow_text
 
+logger = logging.getLogger(__name__)
 
-class ConveRTFeaturizer(Featurizer):
 
+class ConvertFeaturizer(Featurizer):
     provides = [
         MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
         for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
     ]
 
-    def _load_model(self) -> None:
+    defaults = {
+        # model key identified by HF Transformers
+        "return_sequence": True
+    }
 
-        import tensorflow_text
-        import tensorflow_hub as tfhub
+    def _load_model(self):
+
+        self.return_sequence = self.component_config["return_sequence"]
 
         self.graph = tf.Graph()
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
@@ -35,37 +42,23 @@ def _load_model(self) -> None:
             self.module = tfhub.Module(model_url)
 
             self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            self.encoding_tensor = self.module(self.text_placeholder)
+            if self.return_sequence:
+                self.sequence_encoding_tensor = self.module(
+                    self.text_placeholder, signature="encode_sequence", as_dict=True
+                )
+                self.tokenized = self.module(
+                    self.text_placeholder, signature="tokenize"
+                )
+            self.sentence_encoding_tensor = self.module(self.text_placeholder)
             self.session.run(tf.tables_initializer())
             self.session.run(tf.global_variables_initializer())
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
-        super(ConveRTFeaturizer, self).__init__(component_config)
+        super(ConvertFeaturizer, self).__init__(component_config)
 
         self._load_model()
 
-    @classmethod
-    def required_packages(cls) -> List[Text]:
-        return ["tensorflow_text", "tensorflow_hub"]
-
-    def _compute_features(
-        self, batch_examples: List[Message], attribute: Text = MESSAGE_TEXT_ATTRIBUTE
-    ) -> np.ndarray:
-
-        # Get text for attribute of each example
-        batch_attribute_text = [ex.get(attribute) for ex in batch_examples]
-
-        batch_features = self._run_model_on_text(batch_attribute_text)
-
-        return batch_features
-
-    def _run_model_on_text(self, batch: List[Text]) -> np.ndarray:
-
-        return self.session.run(
-            self.encoding_tensor, feed_dict={self.text_placeholder: batch}
-        )
-
     def train(
         self,
         training_data: TrainingData,
@@ -73,43 +66,132 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        batch_size = 64
+        bs = 64
 
-        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
+        for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
 
-            non_empty_examples = list(
-                filter(lambda x: x.get(attribute), training_data.training_examples)
-            )
-
-            batch_start_index = 0
+            start_index = 0
 
-            while batch_start_index < len(non_empty_examples):
+            while start_index < len(training_data.intent_examples):
 
-                batch_end_index = min(
-                    batch_start_index + batch_size, len(non_empty_examples)
-                )
+                end_index = min(start_index + bs, len(training_data.intent_examples))
+                batch_examples = training_data.intent_examples[start_index:end_index]
 
-                # Collect batch examples
-                batch_examples = non_empty_examples[batch_start_index:batch_end_index]
+                batch_text = [
+                    self._clean_text(ex.get(attribute)) for ex in batch_examples
+                ]
 
-                batch_features = self._compute_features(batch_examples, attribute)
+                batch_feats = self._compute_features(batch_text)
 
                 for index, ex in enumerate(batch_examples):
 
+                    # print(batch_text[index], batch_feats[index].shape)
                     ex.set(
                         MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
                         self._combine_with_existing_dense_features(
                             ex,
-                            batch_features[index],
+                            batch_feats[index],
                             MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
                         ),
                     )
 
-                batch_start_index += batch_size
+                start_index += bs
+
+    @staticmethod
+    def _clean_text(text):
+
+        cleaned_text = re.sub(
+            # there is a space or an end of a string after it
+            r"[^\w#@&]+(?=\s|$)|"
+            # there is a space or beginning of a string before it
+            # not followed by a number
+            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
+            # not in between numbers and not . or @ or & or - or #
+            # e.g. 10'000.00 or blabla@gmail.com
+            # and not url characters
+            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
+            " ",
+            text,
+        )
+
+        # remove multiple occurences of ' '
+        cleaned_text = re.sub(" +", " ", cleaned_text)
+
+        # remove " & "
+        cleaned_text = re.sub("\s&\s", " ", cleaned_text)
+
+        if not cleaned_text.strip():
+            cleaned_text = text
+
+        return cleaned_text.strip()
+
+    def _tokenize(self, sentence):
+
+        return self.session.run(
+            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
+        )
+
+    def _compute_features(self, batch_examples):
+
+        sentence_encodings = self.session.run(
+            self.sentence_encoding_tensor,
+            feed_dict={self.text_placeholder: batch_examples},
+        )
+
+        # convert them to a sequence
+        sentence_encodings = np.reshape(
+            sentence_encodings, (len(batch_examples), 1, -1)
+        )
+
+        if self.return_sequence:
+
+            final_embeddings = []
+
+            batch_tokenized = [self._tokenize(sentence) for sentence in batch_examples]
+
+            actual_lens = [token_vector.shape[1] for token_vector in batch_tokenized]
+
+            sequence_encodings = self.session.run(
+                self.sequence_encoding_tensor,
+                feed_dict={self.text_placeholder: batch_examples},
+            )["sequence_encoding"]
+
+            for index in range(len(batch_examples)):
+
+                seq_len = actual_lens[index]
+                seq_enc = sequence_encodings[index][:seq_len]
+                sent_enc = sentence_encodings[index]
+
+                # tile seq enc to duplicate
+                seq_enc = np.tile(seq_enc, (1, 2))
+
+                # add sent_enc to the end
+                seq_enc = np.concatenate([seq_enc, sent_enc], axis=0)
+
+                final_embeddings.append(seq_enc)
+
+            return final_embeddings
+
+        return sentence_encodings
+
+    def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
+
+        message_attribute_text = example.get(attribute)
+        if message_attribute_text:
+            # Encode text
+            features = self.module([message_attribute_text])[0]
+            features = self._combine_with_existing_dense_features(
+                example, features, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
+            )
+            # print(features.shape)
+            example.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features)
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        feats = self._compute_features([message])[0]
+        feats = self._compute_features(
+            [self._clean_text(message.get(MESSAGE_TEXT_ATTRIBUTE))]
+        )[0]
+
         message.set(
             MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
             self._combine_with_existing_dense_features(
@@ -118,3 +200,4 @@ def process(self, message: Message, **kwargs: Any) -> None:
                 MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
             ),
         )
+        # self._set_lm_features(message)
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 76801af1996a..69d32edd2dfb 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -9,7 +9,7 @@
 import typing
 from typing import Any, Dict, List, Optional, Text, Type
 
-from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer
+from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConvertFeaturizer
 from rasa.nlu.tokenizers.convert_tokenizer import ConvertTokenizer
 from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
@@ -74,7 +74,7 @@
     RegexFeaturizer,
     CountVectorsFeaturizer,
     PreTrainedLMFeaturizer,
-    ConveRTFeaturizer,
+    ConvertFeaturizer,
     # classifiers
     SklearnIntentClassifier,
     MitieIntentClassifier,
@@ -109,7 +109,6 @@
     "intent_classifier_mitie": "MitieIntentClassifier",
     "intent_classifier_keyword": "KeywordIntentClassifier",
     "intent_classifier_tensorflow_embedding": "EmbeddingIntentClassifier",
-    "ConvertFeaturizer": "ConveRTFeaturizer",
 }
 
 # To simplify usage, there are a couple of model templates, that already add

From 77465df31fbdd99f2cb73f61314b68d26bc9598c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 25 Nov 2019 17:33:51 +0100
Subject: [PATCH 057/633] add bilou flag to embedding intent classifier

---
 .../embedding_intent_classifier.py            | 85 ++++++++++++++++---
 1 file changed, 74 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index ed1d8cb837c7..aa7c163b46fc 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -10,6 +10,7 @@
 from typing import Any, Dict, List, Optional, Text, Tuple, Union
 from shutil import copyfile
 
+from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 from tf_metrics import f1
 
 import rasa.utils.io as io_utils
@@ -43,6 +44,9 @@
     from rasa.nlu.training_data import Message
 
 
+MESSAGE_BILOU_ENTITIES_ATTRIBUTE = "BILOU_entities"
+
+
 class EmbeddingIntentClassifier(EntityExtractor):
     """Intent classifier using supervised embeddings.
 
@@ -146,6 +150,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "named_entity_recognition": True,
         "masked_lm_loss": False,
         "sparse_input_dropout": False,
+        "bilou_flag": False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -246,6 +251,7 @@ def _load_params(self) -> None:
         ]
         self.masked_lm_loss = self.component_config["masked_lm_loss"]
         self.sparse_input_dropout = self.component_config["sparse_input_dropout"]
+        self.bilou_flag = self.component_config["bilou_flag"]
 
     # package safety checks
     @classmethod
@@ -328,21 +334,43 @@ def _create_label_id_dict(
 
     @staticmethod
     def _create_tag_id_dict(
-        training_data: "TrainingData", attribute: Text
+        training_data: "TrainingData", bilou_flag: bool
     ) -> Dict[Text, int]:
         """Create label_id dictionary"""
 
+        if bilou_flag:
+            bilou_prefix = ["B-", "I-", "L-", "U-"]
+            distinct_tag_ids = set(
+                [
+                    e[2:]
+                    for example in training_data.training_examples
+                    if example.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
+                    for e in example.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
+                ]
+            ) - {""}
+
+            tag_id_dict = {
+                f"{prefix}{tag_id}": idx_1 * len(bilou_prefix) + idx_2
+                for idx_1, tag_id in enumerate(sorted(distinct_tag_ids))
+                for idx_2, prefix in enumerate(bilou_prefix)
+            }
+            tag_id_dict["O"] = len(distinct_tag_ids) * len(bilou_prefix)
+
+            return tag_id_dict
+
         distinct_tag_ids = set(
             [
                 e["entity"]
                 for example in training_data.entity_examples
-                for e in example.get(attribute)
+                for e in example.get(MESSAGE_ENTITIES_ATTRIBUTE)
             ]
         ) - {None}
+
         tag_id_dict = {
             tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
         }
         tag_id_dict["O"] = 0
+
         return tag_id_dict
 
     @staticmethod
@@ -578,12 +606,26 @@ def _create_session_data(
                 label_ids.append(label_id_dict[e.get(label_attribute)])
 
             if self.named_entity_recognition and tag_id_dict:
-                _tags = []
-                for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
-                    _tag = determine_token_labels(
-                        t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
-                    )
-                    _tags.append(tag_id_dict[_tag])
+                if self.bilou_flag:
+                    if e.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE):
+                        _tags = [
+                            tag_id_dict[_tag]
+                            if _tag in tag_id_dict
+                            else tag_id_dict["O"]
+                            for _tag in e.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
+                        ]
+                    else:
+                        _tags = [
+                            tag_id_dict["O"]
+                            for _ in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE])
+                        ]
+                else:
+                    _tags = []
+                    for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]):
+                        _tag = determine_token_labels(
+                            t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None
+                        )
+                        _tags.append(tag_id_dict[_tag])
                 # transpose to have seq_len x 1
                 tag_ids.append(np.array([_tags]).T)
 
@@ -1009,6 +1051,9 @@ def preprocess_train_data(self, training_data: "TrainingData"):
 
         Performs sanity checks on training data, extracts encodings for labels.
         """
+        if self.bilou_flag:
+            self.apply_bilou_schema(training_data)
+
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=MESSAGE_INTENT_ATTRIBUTE
         )
@@ -1018,9 +1063,7 @@ def preprocess_train_data(self, training_data: "TrainingData"):
             training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE
         )
 
-        tag_id_dict = self._create_tag_id_dict(
-            training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE
-        )
+        tag_id_dict = self._create_tag_id_dict(training_data, self.bilou_flag)
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
         session_data = self._create_session_data(
@@ -1036,6 +1079,23 @@ def preprocess_train_data(self, training_data: "TrainingData"):
 
         return session_data
 
+    def apply_bilou_schema(self, training_data: "TrainingData"):
+        if not self.named_entity_recognition:
+            return
+
+        for example in training_data.training_examples:
+            entities = example.get(MESSAGE_ENTITIES_ATTRIBUTE)
+
+            if not entities:
+                continue
+
+            entities = CRFEntityExtractor._convert_example(example)
+            output = CRFEntityExtractor._bilou_tags_from_offsets(
+                example.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]), entities
+            )
+
+            example.set(MESSAGE_BILOU_ENTITIES_ATTRIBUTE, output)
+
     # process helpers
     def predict_label(
         self, message: "Message"
@@ -1123,6 +1183,9 @@ def predict_entities(self, message: "Message") -> List[Dict]:
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
 
+        if self.bilou_flag:
+            tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
+
         entities = self._convert_tags_to_entities(
             message.text, message.get("tokens", []), tags
         )

From a41d1f1f6dc9ae5dc3665c47e3b178df5cf63ee8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 29 Nov 2019 17:45:22 +0100
Subject: [PATCH 058/633] use mean vector as cls vector for glove

---
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 75b8f486a6c9..0e7717663abd 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -66,7 +66,7 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
             if cls_token_used:
                 # cls token is used, need to append a vector
-                cls_token_vec = np.zeros([1, fs.shape[-1]])
+                cls_token_vec = np.mean(fs, axis=0, keepdims=True)
                 fs = np.concatenate([fs, cls_token_vec])
 
             features = self._combine_with_existing_dense_features(

From d30adc9d16f06f420847163c3923b5b2a8cf5c7c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 29 Nov 2019 17:49:31 +0100
Subject: [PATCH 059/633] make mean vector configurable

---
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 0e7717663abd..2d762cacafea 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -31,6 +31,8 @@ class SpacyFeaturizer(Featurizer):
         for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
     ] + [MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES]
 
+    defaults = {"use_mean_vec": False}
+
     def _features_for_doc(self, doc: "Doc") -> np.ndarray:
         """Feature vector for a single document / sentence."""
         return np.array([t.vector for t in doc])
@@ -66,7 +68,11 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
             if cls_token_used:
                 # cls token is used, need to append a vector
-                cls_token_vec = np.mean(fs, axis=0, keepdims=True)
+                if self.component_config["use_mean_vec"]:
+                    cls_token_vec = np.mean(fs, axis=0, keepdims=True)
+                else:
+                    cls_token_vec = np.zeros([1, fs.shape[-1]])
+
                 fs = np.concatenate([fs, cls_token_vec])
 
             features = self._combine_with_existing_dense_features(

From 42851102ccaac23159ab56c26a0c945f1f38f417 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 29 Nov 2019 19:21:29 +0100
Subject: [PATCH 060/633] don't include entity score for cls

---
 .../nlu/classifiers/embedding_intent_classifier.py | 14 +++++++++-----
 .../dense_featurizer/spacy_featurizer.py           |  6 +++++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index aa7c163b46fc..18e9e3b8dcc9 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -727,7 +727,7 @@ def _mask_input(
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Randomly mask input sequences."""
 
-        # do not mask cls token
+        # do not substitute with cls token
         pad_mask_up_to_last = tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
         mask_up_to_last = 1 - pad_mask_up_to_last
 
@@ -749,7 +749,7 @@ def _mask_input(
         )
 
         lm_mask_prob = (
-            tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask_up_to_last
+            tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
         )
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
         a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
@@ -886,7 +886,8 @@ def _train_mask_graph(self, a_transformed, a, lm_mask_bool):
     def _train_entity_graph(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
+        mask_up_to_last = 1 - tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
         sequence_lengths.set_shape([mask.shape[0]])
 
         c = tf.reduce_sum(tf.nn.relu(c), -1)
@@ -907,7 +908,8 @@ def _train_entity_graph(
         loss = tf.reduce_mean(-log_likelihood)
 
         # calculate f1 score for train predictions
-        weights = tf.sequence_mask(sequence_lengths)
+        # weights = tf.sequence_mask(sequence_lengths)
+        weights = tf.cast(mask_up_to_last, tf.int32)
         pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
         score = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
 
@@ -1039,7 +1041,9 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
         )
 
     def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
-        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
+        mask_up_to_last = 1 - tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
+        # sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
 
         # predict tagsx
         _, _, pred_ids = self._create_crf(a, sequence_lengths)
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 75b8f486a6c9..a3116696e185 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -66,7 +66,11 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE):
 
             if cls_token_used:
                 # cls token is used, need to append a vector
-                cls_token_vec = np.zeros([1, fs.shape[-1]])
+                # print(fs.shape)
+                cls_token_vec = np.mean(fs, axis=0, keepdims=True)
+                # print(cls_token_vec.shape)
+                # exit()
+                # cls_token_vec = np.zeros([1, fs.shape[-1]])
                 fs = np.concatenate([fs, cls_token_vec])
 
             features = self._combine_with_existing_dense_features(

From 605320e7f44990b25d864d3f0b4c167ab56200fc Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 2 Dec 2019 18:02:29 +0100
Subject: [PATCH 061/633] clean up a bit

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 18e9e3b8dcc9..276e885563d0 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -908,10 +908,8 @@ def _train_entity_graph(
         loss = tf.reduce_mean(-log_likelihood)
 
         # calculate f1 score for train predictions
-        # weights = tf.sequence_mask(sequence_lengths)
-        weights = tf.cast(mask_up_to_last, tf.int32)
         pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
-        score = f1(c, pred_ids, self.num_tags, pos_tag_indices, weights)
+        score = f1(c, pred_ids, self.num_tags, pos_tag_indices, mask_up_to_last)
 
         return loss, score[1]
 
@@ -1043,7 +1041,6 @@ def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
     def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
         mask_up_to_last = 1 - tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
         sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
-        # sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
 
         # predict tagsx
         _, _, pred_ids = self._create_crf(a, sequence_lengths)

From 92dfb082f3e261e8f8987863efe2311525f15bfc Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Dec 2019 08:52:01 +0100
Subject: [PATCH 062/633] fix bilou tagging

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 276e885563d0..4c8f887b2f98 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -350,11 +350,13 @@ def _create_tag_id_dict(
             ) - {""}
 
             tag_id_dict = {
-                f"{prefix}{tag_id}": idx_1 * len(bilou_prefix) + idx_2
+                f"{prefix}{tag_id}": idx_1 * len(bilou_prefix) + idx_2 + 1
                 for idx_1, tag_id in enumerate(sorted(distinct_tag_ids))
                 for idx_2, prefix in enumerate(bilou_prefix)
             }
-            tag_id_dict["O"] = len(distinct_tag_ids) * len(bilou_prefix)
+            tag_id_dict["O"] = 0
+
+            print(tag_id_dict)
 
             return tag_id_dict
 
@@ -748,9 +750,7 @@ def _mask_input(
             other_prob < 0.70, a_mask, tf.where(other_prob < 0.80, a_shuffle, a)
         )
 
-        lm_mask_prob = (
-            tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-        )
+        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
         a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
 

From 028f5747579f6c811ffa399ef88ff6ae03e83642 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 18 Dec 2019 12:25:14 +0100
Subject: [PATCH 063/633] train runs without errors

---
 .../embedding_intent_classifier.py            | 885 ++++++++++--------
 .../selectors/embedding_response_selector.py  |   5 -
 rasa/utils/tf_layers.py                       | 340 +++++++
 rasa/utils/train_utils.py                     | 472 ++++------
 setup.py                                      |  13 +-
 5 files changed, 1003 insertions(+), 712 deletions(-)
 create mode 100644 rasa/utils/tf_layers.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 4c8f887b2f98..f141426267fa 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -11,7 +11,6 @@
 from shutil import copyfile
 
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-from tf_metrics import f1
 
 import rasa.utils.io as io_utils
 from rasa.utils.plotter import Plotter
@@ -20,6 +19,7 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.utils import train_utils
+from rasa.utils import tf_layers
 from rasa.utils.train_utils import SessionDataType, TrainingMetrics
 from rasa.nlu.constants import (
     MESSAGE_INTENT_ATTRIBUTE,
@@ -31,9 +31,7 @@
 )
 
 import tensorflow as tf
-
-# avoid warning println on contrib import - remove for tf 2
-tf.contrib._warning = None
+import tensorflow_addons as tfa
 
 logger = logging.getLogger(__name__)
 
@@ -45,6 +43,7 @@
 
 
 MESSAGE_BILOU_ENTITIES_ATTRIBUTE = "BILOU_entities"
+shapes, types = None, None
 
 
 class EmbeddingIntentClassifier(EntityExtractor):
@@ -309,7 +308,6 @@ def __init__(
         self._iterator = None
         self._train_op = None
         self._is_training = None
-        self._in_layer_norm = {}
 
         # number of entity tags
         self.num_tags = 0
@@ -659,322 +657,6 @@ def _create_session_data(
 
         return session_data
 
-    # tf helpers:
-    def _create_tf_embed_fnn(
-        self,
-        x_in: "tf.Tensor",
-        layer_sizes: List[int],
-        fnn_name: Text,
-        embed_name: Text,
-    ) -> "tf.Tensor":
-        """Create nn with hidden layers and name"""
-
-        x = train_utils.create_tf_fnn(
-            x_in,
-            layer_sizes,
-            self.droprate,
-            self.C2,
-            self._is_training,
-            layer_name_suffix=fnn_name,
-        )
-        return train_utils.create_tf_embed(
-            x, self.embed_dim, self.C2, embed_name, self.similarity_type
-        )
-
-    def combine_sparse_dense_features(
-        self,
-        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-        mask: "tf.Tensor",
-        name: Text,
-        sparse_dropout: bool = False,
-    ) -> "tf.Tensor":
-
-        dense_features = []
-
-        dense_dim = self.dense_dim
-        # if dense features are present use the feature dimension of the dense features
-        for f in features:
-            if not isinstance(f, tf.SparseTensor):
-                dense_dim = f.shape[-1]
-                break
-
-        for f in features:
-            if isinstance(f, tf.SparseTensor):
-                if sparse_dropout:
-                    to_retain_prob = tf.random.uniform(
-                        tf.shape(f.values), 0, 1, f.values.dtype
-                    )
-                    to_retain = tf.greater_equal(to_retain_prob, self.droprate)
-                    _f = tf.sparse.retain(f, to_retain)
-                    _f = tf.cond(self._is_training, lambda: _f, lambda: f)
-                else:
-                    _f = f
-
-                dense_features.append(
-                    train_utils.tf_dense_layer_for_sparse(
-                        _f, dense_dim, name, self.C2, input_dim=int(f.shape[-1])
-                    )
-                )
-            else:
-                dense_features.append(f)
-
-        # if self._in_layer_norm.get(name) is None:
-        #     self._in_layer_norm[name] = tf.keras.layers.LayerNormalization(name=name)
-        # return self._in_layer_norm[name](tf.concat(dense_features, axis=-1))
-
-        return tf.concat(dense_features, axis=-1) * mask
-
-    def _mask_input(
-        self, a: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        """Randomly mask input sequences."""
-
-        # do not substitute with cls token
-        pad_mask_up_to_last = tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        mask_up_to_last = 1 - pad_mask_up_to_last
-
-        a_random_pad = (
-            tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
-            * pad_mask_up_to_last
-        )
-        a_shuffle = tf.stop_gradient(
-            tf.random.shuffle(a * mask_up_to_last + a_random_pad)
-        )
-
-        mask_vector = tf.get_variable("mask_vector", (1, 1, a.shape[-1]), a.dtype)
-        a_mask = tf.tile(mask_vector, (tf.shape(a)[0], tf.shape(a)[1], 1))
-
-        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
-        other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
-        a_other = tf.where(
-            other_prob < 0.70, a_mask, tf.where(other_prob < 0.80, a_shuffle, a)
-        )
-
-        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
-        a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
-
-        a_pre = tf.cond(self._is_training, lambda: a_pre, lambda: a)
-
-        return a_pre, lm_mask_bool
-
-    def _create_tf_sequence(self, a_in: "tf.Tensor", mask: "tf.Tensor") -> "tf.Tensor":
-        """Create sequence level embedding and mask."""
-
-        a_in = train_utils.create_tf_fnn(
-            a_in,
-            self.hidden_layer_sizes["text"],
-            self.droprate,
-            self.C2,
-            self._is_training,
-            layer_name_suffix="text_intent" if self.share_hidden_layers else "text",
-        )
-
-        self.attention_weights = {}
-        hparams = train_utils.create_t2t_hparams(
-            self.num_transformer_layers,
-            self.transformer_size,
-            self.num_heads,
-            self.droprate,
-            self.pos_encoding,
-            self.max_seq_length,
-            self._is_training,
-            self.unidirectional_encoder,
-        )
-
-        a = train_utils.create_t2t_transformer_encoder(
-            a_in, mask, self.attention_weights, hparams, self.C2, self._is_training
-        )
-
-        return a
-
-    # build tf graphs:
-    def _build_tf_train_graph(self, session_data: SessionDataType) -> TrainingMetrics:
-
-        # get in tensors from generator
-        self.batch_in = self._iterator.get_next()
-        # convert encoded all labels into the batch format
-        label_batch = train_utils.prepare_batch(self._label_data)
-
-        # convert batch format into sparse and dense tensors
-        batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data)
-        label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data)
-
-        mask = batch_data["text_mask"][0]
-        a = self.combine_sparse_dense_features(
-            batch_data["text_features"],
-            mask,
-            "text",
-            sparse_dropout=self.sparse_input_dropout,
-        )
-
-        if self.masked_lm_loss:
-            a_pre, lm_mask_bool = self._mask_input(a, mask)
-        else:
-            a_pre, lm_mask_bool = (a, None)
-
-        # transformer
-        a_transformed = self._create_tf_sequence(a_pre, mask)
-
-        metrics = TrainingMetrics(loss={}, score={})
-
-        if self.masked_lm_loss:
-            loss, acc = self._train_mask_graph(a_transformed, a, lm_mask_bool)
-
-            metrics.loss["m_loss"] = loss
-            metrics.score["m_acc"] = acc
-
-        if self.intent_classification:
-            b = self.combine_sparse_dense_features(
-                batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
-            )
-            all_bs = self.combine_sparse_dense_features(
-                label_data["intent_features"], label_data["intent_mask"][0], "intent"
-            )
-
-            loss, acc = self._train_intent_graph(a_transformed, b, all_bs, mask)
-            metrics.loss["i_loss"] = loss
-            metrics.score["i_acc"] = acc
-
-        if self.named_entity_recognition:
-            c = self.combine_sparse_dense_features(batch_data["tag_ids"], mask, "tag")
-
-            loss, f1_score = self._train_entity_graph(a_transformed, c, mask)
-            metrics.loss["e_loss"] = loss
-            metrics.score["e_f1"] = f1_score
-
-        return metrics
-
-    def _train_mask_graph(self, a_transformed, a, lm_mask_bool):
-
-        # make sure there is at least one element in the mask
-        lm_mask_bool = tf.cond(
-            tf.reduce_any(lm_mask_bool),
-            lambda: lm_mask_bool,
-            lambda: tf.scatter_nd([[0, 0, 0]], [True], tf.shape(lm_mask_bool)),
-        )
-
-        lm_mask_bool = tf.squeeze(lm_mask_bool, -1)
-        a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
-        a_masked = tf.boolean_mask(a, lm_mask_bool)
-
-        a_t_masked_embed = train_utils.create_tf_embed(
-            a_t_masked, self.embed_dim, self.C2, "a_transformed", self.similarity_type
-        )
-
-        a_embed = train_utils.create_tf_embed(
-            a, self.embed_dim, self.C2, "a", self.similarity_type
-        )
-        a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
-
-        return train_utils.calculate_loss_acc(
-            a_t_masked_embed,
-            a_embed_masked,
-            a_masked,
-            a_embed,
-            a,
-            self.num_neg,
-            None,
-            self.loss_type,
-            self.mu_pos,
-            self.mu_neg,
-            self.use_max_sim_neg,
-            self.C_emb,
-            self.scale_loss,
-        )
-
-    def _train_entity_graph(
-        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        mask_up_to_last = 1 - tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
-        sequence_lengths.set_shape([mask.shape[0]])
-
-        c = tf.reduce_sum(tf.nn.relu(c), -1)
-        c = tf.cast(c, tf.int32)
-
-        # tensor shapes
-        # a: tensor(batch-size, max-seq-len, dim)
-        # sequence_lengths: tensor(batch-size)
-        # c: (batch-size, max-seq-len)
-
-        # CRF
-        crf_params, logits, pred_ids = self._create_crf(a, sequence_lengths)
-
-        # Loss
-        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
-            logits, c, sequence_lengths, crf_params
-        )
-        loss = tf.reduce_mean(-log_likelihood)
-
-        # calculate f1 score for train predictions
-        pos_tag_indices = [k for k, v in self.inverted_tag_dict.items() if v != "O"]
-        score = f1(c, pred_ids, self.num_tags, pos_tag_indices, mask_up_to_last)
-
-        return loss, score[1]
-
-    def _train_intent_graph(
-        self, a: "tf.Tensor", b: "tf.Tensor", all_bs: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-
-        # get _cls_ vector for intent classification
-        self.cls_embed = tf.reduce_sum(a * last, 1)
-        self.cls_embed = train_utils.create_tf_embed(
-            self.cls_embed, self.embed_dim, self.C2, "cls", self.similarity_type
-        )
-
-        b = tf.reduce_sum(tf.nn.relu(b), 1)
-        all_bs = tf.reduce_sum(tf.nn.relu(all_bs), 1)
-
-        self.label_embed = self._create_tf_embed_fnn(
-            b,
-            self.hidden_layer_sizes["intent"],
-            fnn_name="text_intent" if self.share_hidden_layers else "intent",
-            embed_name="intent",
-        )
-        self.all_labels_embed = self._create_tf_embed_fnn(
-            all_bs,
-            self.hidden_layer_sizes["intent"],
-            fnn_name="text_intent" if self.share_hidden_layers else "intent",
-            embed_name="intent",
-        )
-
-        return train_utils.calculate_loss_acc(
-            self.cls_embed,
-            self.label_embed,
-            b,
-            self.all_labels_embed,
-            all_bs,
-            self.num_neg,
-            None,
-            self.loss_type,
-            self.mu_pos,
-            self.mu_neg,
-            self.use_max_sim_neg,
-            self.C_emb,
-            self.scale_loss,
-        )
-
-    def _create_crf(
-        self, input: tf.Tensor, sequence_lengths: tf.Tensor
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-        with tf.variable_scope("ner", reuse=tf.AUTO_REUSE):
-            logits = train_utils.create_tf_embed(
-                input, self.num_tags, self.C2, "crf-logits"
-            )
-            crf_params = tf.get_variable(
-                "crf-params",
-                [self.num_tags, self.num_tags],
-                dtype=tf.float32,
-                regularizer=tf.contrib.layers.l2_regularizer(self.C2),
-            )
-            pred_ids, _ = tf.contrib.crf.crf_decode(
-                logits, crf_params, sequence_lengths
-            )
-
-            return crf_params, logits, pred_ids
-
     def _build_tf_pred_graph(self, session_data: "SessionDataType"):
 
         shapes, types = train_utils.get_shapes_types(session_data)
@@ -1264,75 +946,70 @@ def train(
         else:
             eval_session_data = None
 
-        self.graph = tf.Graph()
-        with self.graph.as_default():
-            # set random seed
-            tf.set_random_seed(self.random_seed)
+        # set random seed
+        tf.random.set_seed(self.random_seed)
 
-            # allows increasing batch size
-            batch_size_in = tf.placeholder(tf.int64)
+        # allows increasing batch size
+        batch_size_in = self.batch_in_size[0] #* tf.ones((), tf.int32)
 
-            (
-                self._iterator,
-                train_init_op,
-                eval_init_op,
-            ) = train_utils.create_iterator_init_datasets(
-                session_data,
-                eval_session_data,
-                batch_size_in,
-                self.batch_in_strategy,
-                label_key="intent_ids",
-            )
-
-            self._is_training = tf.placeholder_with_default(False, shape=())
-
-            metrics = self._build_tf_train_graph(session_data)
-
-            # calculate overall loss
-            if self.normalize_loss:
-                loss = tf.add_n(
-                    [
-                        _loss / (tf.stop_gradient(_loss) + 1e-8)
-                        for _loss in metrics.loss.values()
-                    ]
-                )
-            else:
-                loss = tf.add_n(list(metrics.loss.values()))
+        train_dataset, eval_dataset = train_utils.create_datasets(
+            session_data,
+            eval_session_data,
+            batch_size_in,
+            self.batch_in_strategy,
+            label_key="intent_ids",
+        )
 
-            # define which optimizer to use
-            if self.optimizer.lower() == "nadam":
-                self._train_op = tf.contrib.opt.NadamOptimizer(
-                    learning_rate=self.learning_rate
-                ).minimize(loss)
-            else:
-                self._train_op = tf.train.AdamOptimizer(
-                    learning_rate=self.learning_rate
-                ).minimize(loss)
-
-            # train tensorflow graph
-            self.session = tf.Session(config=self._tf_config)
-
-            train_utils.train_tf_dataset(
-                train_init_op,
-                eval_init_op,
-                batch_size_in,
-                metrics,
-                self._train_op,
-                self.session,
-                self._is_training,
-                self.epochs,
-                self.batch_in_size,
-                self.evaluate_on_num_examples,
-                self.evaluate_every_num_epochs,
-                output_file=self.training_log_file,
-            )
+        self.model = DIET(session_data,
+                          self._label_data,
+                          self.dense_dim,
+                          self.embed_dim,
+                          self.hidden_layer_sizes,
+                          self.share_hidden_layers,
+                          self.num_transformer_layers,
+                          self.transformer_size,
+                          self.num_heads,
+                          self.pos_encoding,
+                          self.max_seq_length,
+                          self.unidirectional_encoder,
+                          self.C2,
+                          self.droprate,
+                          self.sparse_input_dropout,
+                          self.num_neg,
+                          self.loss_type,
+                          self.mu_pos,
+                          self.mu_neg,
+                          self.use_max_sim_neg,
+                          self.C_emb,
+                          self.scale_loss,
+                          self.similarity_type,
+                          self.masked_lm_loss,
+                          self.intent_classification,
+                          self.named_entity_recognition,
+                          self.inverted_tag_dict,
+                          self.learning_rate)
+
+        train_func = tf.function(self.model.train, input_signature=[train_dataset.element_spec])
+        # train_func = self.model.train
+        train_utils.train_tf_dataset(
+            train_dataset,
+            eval_dataset,
+            batch_size_in,
+            train_func,
+            self.model.loss_metric,
+            self.epochs,
+            self.batch_in_size,
+            self.evaluate_on_num_examples,
+            self.evaluate_every_num_epochs,
+            output_file=self.training_log_file,
+        )
 
-            # rebuild the graph for prediction
-            self._build_tf_pred_graph(session_data)
+        # rebuild the graph for prediction
+        self._build_tf_pred_graph(session_data)
 
-            self.attention_weights = train_utils.extract_attention(
-                self.attention_weights
-            )
+        self.attention_weights = train_utils.extract_attention(
+            self.attention_weights
+        )
 
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
@@ -1493,3 +1170,441 @@ def load(
                 "doesn't exist."
             )
             return cls(component_config=meta)
+
+
+class DIET(tf.Module):
+
+    @staticmethod
+    def _create_sparse_dense_layer(values, name, C2, dense_dim):
+
+        input_dim = None
+        for v in values:
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                input_dim = v[0].shape[-1]
+            else:
+                # if dense features are present
+                # use the feature dimension of the dense features
+                dense_dim = v[0].shape[-1]
+
+        if input_dim:
+            return tf_layers.DenseForSparse(input_dim=input_dim,
+                                            units=dense_dim,
+                                            C2=C2,
+                                            name=name)
+
+    @staticmethod
+    def _input_dim(values, dense_dim):
+
+        for v in values:
+            if not isinstance(v[0], scipy.sparse.spmatrix):
+                # if dense features are present
+                # use the feature dimension of the dense features
+                dense_dim = v[0].shape[-1]
+                break
+
+        return dense_dim * len(values)
+
+    @staticmethod
+    def _get_layers(layers: Dict):
+        return [layer for layer in layers.values() if layer is not None]
+
+    def __init__(self,
+                 session_data,
+                 label_data,
+                 dense_dim,
+                 embed_dim,
+                 hidden_layer_sizes,
+                 share_hidden_layers,
+                 num_transformer_layers,
+                 transformer_size,
+                 num_heads,
+                 pos_encoding,
+                 max_seq_length,
+                 unidirectional_encoder,
+                 C2,
+                 droprate,
+                 sparse_input_dropout,
+                 num_neg,
+                 loss_type,
+                 mu_pos,
+                 mu_neg,
+                 use_max_sim_neg,
+                 C_emb,
+                 scale_loss,
+                 similarity_type,
+                 masked_lm_loss,
+                 intent_classification,
+                 named_entity_recognition,
+                 inverted_tag_dict,
+                 learning_rate):
+        super(DIET, self).__init__(name="DIET")
+
+        # data
+        self.session_data = session_data
+        label_batch = train_utils.prepare_batch(label_data)
+        self.tf_label_data, _ = train_utils.batch_to_session_data(label_batch, label_data)
+
+        # options
+        self._sparse_input_dropout = sparse_input_dropout
+        # self._hparams = train_utils.create_t2t_hparams(
+        #     num_transformer_layers,
+        #     transformer_size,
+        #     num_heads,
+        #     droprate,
+        #     pos_encoding,
+        #     max_seq_length,
+        #     unidirectional_encoder,
+        # )
+        self._num_neg = num_neg
+        self._loss_type = loss_type
+        self._mu_pos = mu_pos
+        self._mu_neg = mu_neg
+        self._use_max_sim_neg = use_max_sim_neg
+        self._C_emb = C_emb
+        self._scale_loss = scale_loss
+        self._masked_lm_loss = masked_lm_loss
+        self._intent_classification = intent_classification
+        self._named_entity_recognition = named_entity_recognition
+        self._inverted_tag_dict = inverted_tag_dict
+        self._num_tags = len(inverted_tag_dict)
+        print(inverted_tag_dict)
+        exit()
+
+        # tf objects
+        self._layers = []
+
+        self._sparse_dropout = tf_layers.SparseDropout(rate=droprate)
+        self._sparse_to_dense = {
+            "text": self._create_sparse_dense_layer(session_data["text_features"],
+                                                    "text",
+                                                    C2,
+                                                    dense_dim),
+            "intent": self._create_sparse_dense_layer(session_data["intent_features"],
+                                                      "intent",
+                                                      C2,
+                                                      dense_dim),
+        }
+        self._layers.extend(self._get_layers(self._sparse_to_dense))
+
+        text_input_dim = self._input_dim(session_data["text_features"], dense_dim)
+        intent_input_dim = self._input_dim(session_data["intent_features"], dense_dim)
+
+        self._ffnn = {
+            "text": tf_layers.Ffnn(text_input_dim,
+                                   hidden_layer_sizes["text"],
+                                   droprate,
+                                   C2,
+                                   "text_intent" if share_hidden_layers else "text"),
+            "intent": tf_layers.Ffnn(intent_input_dim,
+                                     hidden_layer_sizes["intent"],
+                                     droprate,
+                                     C2,
+                                     "text_intent" if share_hidden_layers else "intent")
+        }
+        self._layers.extend(self._get_layers(self._ffnn))
+
+        # noinspection PyUnresolvedReferences
+        if num_transformer_layers > 0:
+            self._transformer = tf_layers.TransformerEncoder(
+                num_transformer_layers,
+                transformer_size,
+                num_heads,
+                transformer_size * 4,
+                self._ffnn["text"].output_dim,
+                256,
+                droprate
+            )
+            self._layers.append(self._transformer)
+        else:
+            self._transformer = lambda x, mask, training: x
+
+        self._embed = {}
+        if self._masked_lm_loss:
+            self._embed["text_mask"] = tf_layers.Embed(transformer_size,
+                                                       embed_dim,
+                                                       C2,
+                                                       "text_mask",
+                                                       similarity_type)
+            self._embed["text_token"] = tf_layers.Embed(text_input_dim,
+                                                        embed_dim,
+                                                        C2,
+                                                        "text_token",
+                                                        similarity_type)
+        if self._intent_classification:
+            self._embed["text"] = tf_layers.Embed(transformer_size,
+                                                  embed_dim,
+                                                  C2,
+                                                  "text",
+                                                  similarity_type)
+            self._embed["intent"] = tf_layers.Embed(self._ffnn["intent"].output_dim,
+                                                    embed_dim,
+                                                    C2,
+                                                    "intent",
+                                                    similarity_type)
+        if self._named_entity_recognition:
+            self._embed["logits"] = tf_layers.Embed(transformer_size,
+                                                    self._num_tags,
+                                                    C2,
+                                                    "logits")
+        self._layers.extend(self._get_layers(self._embed))
+
+        # tf tensors
+        self.training = tf.ones((), tf.bool)
+        initializer = tf.keras.initializers.GlorotUniform()
+        self._mask_vector = tf.Variable(
+            initial_value=initializer((1, 1, text_input_dim)),
+            trainable=True,
+            name="mask_vector"
+        )
+        self._crf_params = tf.Variable(
+            initial_value=initializer((self._num_tags, self._num_tags)),
+            trainable=True,
+            name="crf_params"
+        )
+        self.all_labels = None
+        self.all_labels_embed = None
+        self.attention_weights = {}
+
+        # tf training
+        self._optimizer = tf.keras.optimizers.Adam(learning_rate)
+        self.loss_metric = tf.keras.metrics.Mean(name='t_loss')
+        self.intent_acc_metric = tf.keras.metrics.Mean(name='i_acc')
+        self.entity_f1_metric = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro', name="e_f1")
+
+    def _combine_sparse_dense_features(
+            self,
+            features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+            mask: "tf.Tensor",
+            name: Text,
+            sparse_dropout: bool = False,
+    ) -> "tf.Tensor":
+
+        dense_features = []
+
+        for f in features:
+            if isinstance(f, tf.SparseTensor):
+                if sparse_dropout:
+                    _f = self._sparse_dropout(f, self.training)
+                else:
+                    _f = f
+
+                dense_features.append(
+                    self._sparse_to_dense[name](_f)
+                )
+            else:
+                dense_features.append(f)
+
+        return tf.concat(dense_features, axis=-1) * mask
+
+    def _create_bow(self,
+                    features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+                    mask: "tf.Tensor",
+                    name: Text):
+
+        x = self._combine_sparse_dense_features(features, mask, name)
+        return self._ffnn[name](tf.reduce_sum(x, 1), self.training)
+
+    def _mask_input(
+            self, a: "tf.Tensor", mask: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        """Randomly mask input sequences."""
+
+        # do not substitute with cls token
+        pad_mask_up_to_last = tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        mask_up_to_last = 1 - pad_mask_up_to_last
+
+        a_random_pad = (
+                tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
+                * pad_mask_up_to_last
+        )
+        a_shuffle = tf.stop_gradient(
+            tf.random.shuffle(a * mask_up_to_last + a_random_pad)
+        )
+
+        a_mask = tf.tile(self._mask_vector, (tf.shape(a)[0], tf.shape(a)[1], 1))
+
+        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
+        other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
+        a_other = tf.where(
+            other_prob < 0.70, a_mask, tf.where(other_prob < 0.80, a_shuffle, a)
+        )
+
+        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
+        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
+        a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
+
+        a_pre = tf.cond(self.training, lambda: a_pre, lambda: a)
+
+        return a_pre, lm_mask_bool
+
+    def _create_sequence(self,
+                         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+                         mask: "tf.Tensor",
+                         name: Text,
+                         masked_lm_loss: bool):
+        x = self._combine_sparse_dense_features(
+            features,
+            mask,
+            name,
+            sparse_dropout=self._sparse_input_dropout,
+        )
+
+        if masked_lm_loss:
+            pre, lm_mask_bool = self._mask_input(x, mask)
+        else:
+            pre, lm_mask_bool = (x, None)
+
+        transformed = self._transformer(pre, mask, self.training)
+        # transformed = train_utils.create_t2t_transformer_encoder(
+        #     self._ffnn[name](pre),
+        #     self._pre_transformer,
+        #     mask,
+        #     self.attention_weights,
+        #     self._hparams,
+        #     name,
+        # )
+
+        return transformed, x, lm_mask_bool
+
+    def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
+        # make sure there is at least one element in the mask
+        lm_mask_bool = tf.cond(
+            tf.reduce_any(lm_mask_bool),
+            lambda: lm_mask_bool,
+            lambda: tf.scatter_nd([[0, 0, 0]], [True], tf.shape(lm_mask_bool)),
+        )
+
+        lm_mask_bool = tf.squeeze(lm_mask_bool, -1)
+        a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
+        a_masked = tf.boolean_mask(a, lm_mask_bool)
+
+        a_t_masked_embed = self._embed[f"{name}_mask"](a_t_masked)
+        a_embed = self._embed[f"{name}_token"](a)
+
+        a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
+
+        return train_utils.calculate_loss_acc(
+            a_t_masked_embed,
+            a_embed_masked,
+            a_masked,
+            a_embed,
+            a,
+            self._num_neg,
+            None,
+            self._loss_type,
+            self._mu_pos,
+            self._mu_neg,
+            self._use_max_sim_neg,
+            self._C_emb,
+            self._scale_loss,
+        )
+
+    def _intent_loss(self, a, b):
+
+        a_embed = self._embed["text"](a)
+        b_embed = self._embed["intent"](b)
+
+        return train_utils.calculate_loss_acc(
+            a_embed,
+            b_embed,
+            b,
+            self.all_labels_embed,
+            self.all_labels,
+            self._num_neg,
+            None,
+            self._loss_type,
+            self._mu_pos,
+            self._mu_neg,
+            self._use_max_sim_neg,
+            self._C_emb,
+            self._scale_loss,
+        )
+
+    def _entity_loss(
+        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+
+        mask_up_to_last = 1 - tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
+        sequence_lengths.set_shape([mask.shape[0]])
+
+        c = tf.cast(c[:, :, 0], tf.int32)
+        logits = self._embed["logits"](a)
+
+        # tensor shapes
+        # a: tensor(batch-size, max-seq-len, dim)
+        # sequence_lengths: tensor(batch-size)
+        # c: (batch-size, max-seq-len)
+
+        # CRF Loss
+        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
+            logits, c, sequence_lengths, self._crf_params
+        )
+        tf.print("ll", tf.reduce_max(log_likelihood))
+        loss = tf.reduce_mean(-log_likelihood)
+        tf.print("loss", loss)
+
+        # CRF preds
+        pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
+
+        # calculate f1 score for train predictions
+        score = self.entity_f1_metric(c, pred_ids)
+
+        return loss, score
+
+    def _build_all_b(self):
+        if self._intent_classification:
+            self.all_labels = self._create_bow(
+                self.tf_label_data["intent_features"], self.tf_label_data["intent_mask"][0], "intent"
+            )
+            self.all_labels_embed = self._embed["intent"](self.all_labels)
+
+    def _create_metrics(self, batch_in):
+        tf_batch_data, _ = train_utils.batch_to_session_data(batch_in, self.session_data)
+
+        mask_text = tf_batch_data["text_mask"][0]
+        text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss)
+
+        metrics = TrainingMetrics(loss={}, score={})
+
+        # if self._masked_lm_loss:
+        #     loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
+        #
+        #     metrics.loss["m_loss"] = loss
+        #     metrics.score["m_acc"] = acc
+
+        # if self._intent_classification:
+        #     last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
+        #     # get _cls_ vector for intent classification
+        #     cls = tf.reduce_sum(text_transformed * last_text, 1)
+        #     label = self._create_bow(
+        #         tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
+        #     )
+        #     loss, acc = self._intent_loss(cls, label)
+        #
+        #     metrics.loss["i_loss"] = loss
+        #     metrics.score["i_acc"] = acc
+
+        if self._named_entity_recognition:
+            tags = tf_batch_data["tag_ids"][0]
+
+            loss, f1_score = self._entity_loss(text_transformed, tags, mask_text)
+            metrics.loss["e_loss"] = loss
+            metrics.score["e_f1"] = f1_score
+
+        return metrics
+
+    def train(self, batch_in):
+
+        with tf.GradientTape() as tape:
+            self._build_all_b()
+            metrics = self._create_metrics(batch_in)
+            reg_losses = tf.math.add_n([tf.math.add_n(layer.losses) for layer in self._layers if layer.losses])
+            # total_loss = reg_losses
+            total_loss = tf.math.add_n(list(metrics.loss.values()))# + reg_losses
+
+        gradients = tape.gradient(total_loss, self.trainable_variables)
+        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+        self.loss_metric.update_state(total_loss)
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 5643ecbedfcc..39218cc48760 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -16,11 +16,6 @@
 if typing.TYPE_CHECKING:
     from rasa.nlu.training_data import Message
 
-import tensorflow as tf
-
-# avoid warning println on contrib import - remove for tf 2
-tf.contrib._warning = None
-
 
 class ResponseSelector(EmbeddingIntentClassifier):
     """Response selector using supervised embeddings.
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
new file mode 100644
index 000000000000..99c89875840f
--- /dev/null
+++ b/rasa/utils/tf_layers.py
@@ -0,0 +1,340 @@
+import logging
+import typing
+from typing import (
+    List,
+    Optional,
+    Text,
+    Dict,
+    Tuple,
+    Union,
+    Generator,
+    Callable,
+    Any,
+    NamedTuple,
+)
+import tensorflow as tf
+import numpy as np
+
+if typing.TYPE_CHECKING:
+    from tensor2tensor.utils.hparam import HParams
+
+logger = logging.getLogger(__name__)
+
+
+class SparseDropout(tf.keras.layers.Dropout):
+
+    def call(self, inputs, training):
+        if training is None:
+            training = tf.keras.backend.learning_phase()
+
+        to_retain_prob = tf.random.uniform(
+            tf.shape(inputs.values), 0, 1, inputs.values.dtype
+        )
+        to_retain = tf.greater_equal(to_retain_prob, self.rate)
+        dropped_inputs = tf.sparse.retain(inputs, to_retain)
+        outputs = tf.cond(training, lambda: dropped_inputs, lambda: inputs)
+        outputs._dense_shape = inputs._dense_shape
+
+        return outputs
+
+
+class DenseForSparse(tf.keras.layers.Dense):
+    """Dense layer for sparse input tensor"""
+
+    # noinspection PyPep8Naming
+    def __init__(self,
+                 C2: float,
+                 activation: Optional[Callable] = tf.nn.relu,
+                 **kwargs):
+        kernel_regularizer = tf.keras.regularizers.l1(C2)
+
+        super(DenseForSparse, self).__init__(kernel_regularizer=kernel_regularizer,
+                                             activation=activation,
+                                             **kwargs)
+
+    def call(self, inputs):
+        if not isinstance(inputs, tf.SparseTensor):
+            raise ValueError("Input tensor should be sparse.")
+
+        # outputs will be 2D
+        outputs = tf.sparse.sparse_dense_matmul(tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), self.kernel)
+
+        if len(inputs.shape) == 3:
+            # reshape back
+            outputs = tf.reshape(
+                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)
+            )
+
+        if self.use_bias:
+            outputs = tf.nn.bias_add(outputs, self.bias)
+        if self.activation is not None:
+            return self.activation(outputs)
+        return outputs
+
+
+class Ffnn(tf.keras.layers.Layer):
+    """Create feed-forward nn with hidden layers and name suffix."""
+
+    # noinspection PyPep8Naming
+    def __init__(
+        self,
+        input_dim: int,
+        layer_sizes: List[int],
+        droprate: float,
+        C2: float,
+        layer_name_suffix: Text,
+        activation: Optional[Callable] = tf.nn.relu,
+        use_bias: bool = True,
+        kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None,
+    ):
+        super(Ffnn, self).__init__(name=f"ffnn_{layer_name_suffix}")
+
+        self._layers = []
+        for i, layer_size in enumerate(layer_sizes):
+            self._layers.append(tf.keras.layers.Dense(
+                units=layer_size,
+                input_dim=input_dim,
+                activation=activation,
+                use_bias=use_bias,
+                kernel_initializer=kernel_initializer,
+                kernel_regularizer=tf.keras.regularizers.l2(C2),
+                name=f"hidden_layer_{layer_name_suffix}_{i}",
+            ))
+            input_dim = layer_size
+            self._layers.append(tf.keras.layers.Dropout(rate=droprate))
+
+        self.output_dim = input_dim
+
+    def call(self, inputs, training):
+        x = inputs
+        for layer in self._layers:
+            x = layer(x, training=training)
+
+        return x
+
+
+class Embed(tf.keras.layers.Layer):
+    """Create dense embedding layer with a name."""
+
+    # noinspection PyPep8Naming
+    def __init__(
+            self,
+            input_dim: int,
+            embed_dim: int,
+            C2: float,
+            layer_name_suffix: Text,
+            similarity_type: Optional[Text] = None,
+    ):
+        super(Embed, self).__init__(name=f"embed_{layer_name_suffix}")
+
+        self.similarity_type = similarity_type
+        if self.similarity_type and self.similarity_type not in {"cosine", "inner"}:
+            raise ValueError(
+                f"Wrong similarity type '{self.similarity_type}', "
+                f"should be 'cosine' or 'inner'"
+            )
+
+        self._layers = [tf.keras.layers.Dense(
+            units=embed_dim,
+            input_dim=input_dim,
+            activation=None,
+            kernel_regularizer=tf.keras.regularizers.l2(C2),
+            name=f"embed_layer_{layer_name_suffix}",
+        )]
+
+    def call(self, inputs):
+        x = inputs
+        for layer in self._layers:
+            x = layer(x)
+        if self.similarity_type == "cosine":
+            x = tf.nn.l2_normalize(x, -1)
+
+        return x
+
+
+# from https://www.tensorflow.org/tutorials/text/transformer
+# TODO add weight regularization (L1)
+# TODO collect losses
+class MultiHeadAttention(tf.keras.layers.Layer):
+
+    @staticmethod
+    def _scaled_dot_product_attention(q, k, v, mask):
+        """Calculate the attention weights.
+        q, k, v must have matching leading dimensions.
+        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
+        The mask has different shapes depending on its type(padding or look ahead)
+        but it must be broadcastable for addition.
+
+        Args:
+          q: query shape == (..., seq_len_q, depth)
+          k: key shape == (..., seq_len_k, depth)
+          v: value shape == (..., seq_len_v, depth_v)
+          mask: Float tensor with shape broadcastable
+                to (..., seq_len_q, seq_len_k). Defaults to None.
+
+        Returns:
+          output, attention_weights
+        """
+
+        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
+
+        # scale matmul_qk
+        dk = tf.cast(tf.shape(k)[-1], tf.float32)
+        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+        # add the mask to the scaled tensor.
+        if mask is not None:
+            scaled_attention_logits += (mask * -1e9)
+
+            # softmax is normalized on the last axis (seq_len_k) so that the scores
+        # add up to 1.
+        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+
+        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
+
+        return output, attention_weights
+
+    def __init__(self, d_model, num_heads):
+        super(MultiHeadAttention, self).__init__()
+        self.num_heads = num_heads
+        self.d_model = d_model
+
+        assert d_model % self.num_heads == 0
+
+        self.depth = d_model // self.num_heads
+
+        self.wq = tf.keras.layers.Dense(d_model)
+        self.wk = tf.keras.layers.Dense(d_model)
+        self.wv = tf.keras.layers.Dense(d_model)
+
+        self.dense = tf.keras.layers.Dense(d_model)
+
+    def split_heads(self, x, batch_size):
+        """Split the last dimension into (num_heads, depth).
+        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
+        """
+        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+
+    def call(self, v, k, q, mask):
+        batch_size = tf.shape(q)[0]
+
+        q = self.wq(q)  # (batch_size, seq_len, d_model)
+        k = self.wk(k)  # (batch_size, seq_len, d_model)
+        v = self.wv(v)  # (batch_size, seq_len, d_model)
+
+        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
+        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
+        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+
+        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
+        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
+        scaled_attention, attention_weights = self._scaled_dot_product_attention(
+            q, k, v, mask)
+
+        scaled_attention = tf.transpose(scaled_attention,
+                                        perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
+
+        concat_attention = tf.reshape(scaled_attention,
+                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+
+        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
+
+        return output, attention_weights
+
+
+# TODO add weight regularization (L2)
+# TODO collect losses
+class TransformerEncoderLayer(tf.keras.layers.Layer):
+    def __init__(self, d_model, num_heads, dff, rate=0.1):
+        super(TransformerEncoderLayer, self).__init__()
+
+        self.mha = MultiHeadAttention(d_model, num_heads)
+        self.ffn = tf.keras.Sequential([
+            tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
+        ])
+
+        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+        self.dropout1 = tf.keras.layers.Dropout(rate)
+        self.dropout2 = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, mask, training):
+        # mask is (batch_size, 1, 1, seq_len)
+        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
+
+        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
+
+        return out2
+
+
+def create_look_ahead_mask(size):
+    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+    return mask  # (seq_len, seq_len)
+
+
+# TODO collect losses
+class TransformerEncoder(tf.keras.layers.Layer):
+
+    @staticmethod
+    def _get_angles(pos, i, d_model):
+        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+        return pos * angle_rates
+
+    @classmethod
+    def _positional_encoding(cls, position, d_model):
+        angle_rads = cls._get_angles(np.arange(position)[:, np.newaxis],
+                                     np.arange(d_model)[np.newaxis, :],
+                                     d_model)
+
+        # apply sin to even indices in the array; 2i
+        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+        # apply cos to odd indices in the array; 2i+1
+        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+        pos_encoding = angle_rads[np.newaxis, ...]
+
+        return tf.cast(pos_encoding, dtype=tf.float32)
+
+    def __init__(self, num_layers, d_model, num_heads, dff, input_dim,
+                 maximum_position_encoding, rate=0.1):
+        super(TransformerEncoder, self).__init__()
+
+        self.d_model = d_model
+        self.num_layers = num_layers
+
+        # TODO use Embed
+        self.embedding = tf.keras.layers.Dense(input_dim=input_dim, units=d_model)
+        self.pos_encoding = self._positional_encoding(maximum_position_encoding,
+                                                      self.d_model)
+
+        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate)
+                           for _ in range(num_layers)]
+
+        self.dropout = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, mask, training):
+
+        seq_len = tf.shape(x)[1]
+        mask = tf.squeeze(mask, -1)
+        mask = mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
+
+        # adding embedding and position encoding.
+        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
+        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self.pos_encoding[:, :seq_len, :]
+
+        x = self.dropout(x, training=training)
+
+        for i in range(self.num_layers):
+            # mask is (batch_size, 1, 1, seq_len)
+            x = self.enc_layers[i](x, mask, training)
+
+        return x  # (batch_size, input_seq_len, d_model)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 265d62bc63c9..653cf9d9fb12 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -18,21 +18,18 @@
 from tqdm import tqdm
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
-from tensor2tensor.models.transformer import (
-    transformer_base,
-    transformer_prepare_encoder,
-    transformer_encoder,
-)
-from tensor2tensor.layers.common_attention import large_compatible_negative
+# from tensor2tensor.models.transformer import (
+#     transformer_base,
+#     transformer_prepare_encoder,
+#     transformer_encoder,
+# )
+# from tensor2tensor.layers.common_attention import large_compatible_negative
 from rasa.utils.common import is_logging_disabled
 
 
 if typing.TYPE_CHECKING:
     from tensor2tensor.utils.hparam import HParams
 
-# avoid warning println on contrib import - remove for tf 2
-tf.contrib._warning = None
-
 logger = logging.getLogger(__name__)
 
 
@@ -517,13 +514,13 @@ def append_type(v: np.ndarray):
     return tuple(shapes), tuple(types)
 
 
-def create_iterator_init_datasets(
+def create_datasets(
     session_data: SessionDataType,
     eval_session_data: SessionDataType,
     batch_size: Union["tf.Tensor", int],
     batch_strategy: Text,
     label_key: Text,
-) -> Tuple["tf.data.Iterator", "tf.Operation", "tf.Operation"]:
+) -> Tuple["tf.data.Dataset", "tf.data.Dataset"]:
     """Create iterator and init datasets."""
 
     train_dataset = create_tf_dataset(
@@ -534,237 +531,100 @@ def create_iterator_init_datasets(
         shuffle=True,
     )
 
-    iterator = tf.data.Iterator.from_structure(
-        train_dataset.output_types, train_dataset.output_shapes
-    )
-
-    train_init_op = iterator.make_initializer(train_dataset)
-
     if eval_session_data is not None:
-        eval_init_op = iterator.make_initializer(
-            create_tf_dataset(eval_session_data, batch_size, label_key=label_key)
-        )
+        eval_dataset = create_tf_dataset(eval_session_data, batch_size, label_key=label_key)
     else:
-        eval_init_op = None
-
-    return iterator, train_init_op, eval_init_op
-
-
-# noinspection PyPep8Naming
-def tf_dense_layer_for_sparse(
-    inputs: tf.SparseTensor,
-    units: int,
-    name: Text,
-    C2: float,
-    activation: Optional[Callable] = tf.nn.relu,
-    use_bias: bool = True,
-    input_dim: Optional[int] = None,
-) -> tf.Tensor:
-    """Dense layer for sparse input tensor"""
-
-    if not isinstance(inputs, tf.SparseTensor):
-        raise ValueError("Input tensor should be sparse.")
-
-    with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE):
-        kernel_regularizer = tf.contrib.layers.l2_regularizer(C2)
-        input_dim = input_dim or int(inputs.shape[-1])
-        kernel = tf.get_variable(
-            "kernel",
-            shape=[input_dim, units],
-            dtype=inputs.dtype,
-            regularizer=kernel_regularizer,
-        )
-        bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype)
-
-        # outputs will be 2D
-        outputs = tf.sparse.matmul(tf.sparse.reshape(inputs, [-1, input_dim]), kernel)
-
-        if len(inputs.shape) == 3:
-            # reshape back
-            outputs = tf.reshape(
-                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)
-            )
-
-        if use_bias:
-            outputs = tf.nn.bias_add(outputs, bias)
-
-    if activation is None:
-        return outputs
-
-    return activation(outputs)
-
-
-# noinspection PyPep8Naming
-def create_tf_fnn(
-    x_in: "tf.Tensor",
-    layer_sizes: List[int],
-    droprate: float,
-    C2: float,
-    is_training: "tf.Tensor",
-    layer_name_suffix: Text,
-    activation: Optional[Callable] = tf.nn.relu,
-    use_bias: bool = True,
-    kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None,
-) -> "tf.Tensor":
-    """Create nn with hidden layers and name suffix."""
-
-    reg = tf.contrib.layers.l2_regularizer(C2)
-    x = tf.nn.relu(x_in)
-    for i, layer_size in enumerate(layer_sizes):
-        x = tf.layers.dense(
-            inputs=x,
-            units=layer_size,
-            activation=activation,
-            use_bias=use_bias,
-            kernel_initializer=kernel_initializer,
-            kernel_regularizer=reg,
-            name=f"hidden_layer_{layer_name_suffix}_{i}",
-            reuse=tf.AUTO_REUSE,
-        )
-        x = tf.layers.dropout(x, rate=droprate, training=is_training)
-    return x
-
-
-def tf_normalize_if_cosine(x: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
-    """Normalize embedding if similarity type is cosine."""
-
-    if similarity_type == "cosine":
-        return tf.nn.l2_normalize(x, -1)
-    elif similarity_type == "inner":
-        return x
-    else:
-        raise ValueError(
-            f"Wrong similarity type '{similarity_type}', "
-            f"should be 'cosine' or 'inner'"
-        )
-
-
-# noinspection PyPep8Naming
-def create_tf_embed(
-    x: "tf.Tensor",
-    embed_dim: int,
-    C2: float,
-    layer_name_suffix: Text,
-    similarity_type: Optional[Text] = None,
-) -> "tf.Tensor":
-    """Create dense embedding layer with a name."""
-
-    reg = tf.contrib.layers.l2_regularizer(C2)
-    embed_x = tf.layers.dense(
-        inputs=x,
-        units=embed_dim,
-        activation=None,
-        kernel_regularizer=reg,
-        name=f"embed_layer_{layer_name_suffix}",
-        reuse=tf.AUTO_REUSE,
-    )
-
-    if similarity_type:
-        # normalize embedding vectors for cosine similarity
-        return tf_normalize_if_cosine(embed_x, similarity_type)
-
-    return embed_x
-
-
-def create_t2t_hparams(
-    num_transformer_layers: int,
-    transformer_size: int,
-    num_heads: int,
-    droprate: float,
-    pos_encoding: Text,
-    max_seq_length: int,
-    is_training: "tf.Tensor",
-    unidirectional_encoder: bool = True,
-) -> "HParams":
-    """Create parameters for t2t transformer."""
-
-    hparams = transformer_base()
-
-    hparams.num_hidden_layers = num_transformer_layers
-    hparams.hidden_size = transformer_size
-    # it seems to be factor of 4 for transformer architectures in t2t
-    hparams.filter_size = hparams.hidden_size * 4
-    hparams.num_heads = num_heads
-    hparams.relu_dropout = droprate
-    hparams.pos = pos_encoding
-
-    hparams.max_length = max_seq_length
-
-    hparams.unidirectional_encoder = unidirectional_encoder
-
-    hparams.self_attention_type = "dot_product_relative_v2"
-    hparams.max_relative_position = 5
-    hparams.add_relative_to_values = True
-
-    # When not in training mode, set all forms of dropout to zero.
-    for key, value in hparams.values().items():
-        if key.endswith("dropout") or key == "label_smoothing":
-            setattr(hparams, key, value * tf.cast(is_training, tf.float32))
-
-    return hparams
-
-
-# noinspection PyUnresolvedReferences
-# noinspection PyPep8Naming
-def create_t2t_transformer_encoder(
-    x_in: "tf.Tensor",
-    mask: "tf.Tensor",
-    attention_weights: Dict[Text, "tf.Tensor"],
-    hparams: "HParams",
-    C2: float,
-    is_training: "tf.Tensor",
-) -> "tf.Tensor":
-    """Create t2t transformer encoder."""
-    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
-        if len(mask.shape) == 2:
-            _mask = tf.expand_dims(mask, -1)
-        else:
-            _mask = mask
-
-        x = create_tf_fnn(
-            x_in,
-            [hparams.hidden_size],
-            hparams.layer_prepostprocess_dropout,
-            C2,
-            is_training,
-            layer_name_suffix="pre_embed",
-            activation=None,
-            use_bias=False,
-            kernel_initializer=tf.random_normal_initializer(
-                0.0, hparams.hidden_size ** -0.5
-            ),
-        )
-        if hparams.multiply_embedding_mode == "sqrt_depth":
-            x *= hparams.hidden_size ** 0.5
-
-        x *= _mask
-        (
-            x,
-            self_attention_bias,
-            encoder_decoder_attention_bias,
-        ) = transformer_prepare_encoder(x, None, hparams)
-
-        x *= _mask
-
-        x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
-
-        attn_bias_for_padding = None
-        # Otherwise the encoder will just use encoder_self_attention_bias.
-        if hparams.unidirectional_encoder:
-            attn_bias_for_padding = encoder_decoder_attention_bias
-
-        x = transformer_encoder(
-            x,
-            self_attention_bias,
-            hparams,
-            nonpadding=_mask,
-            save_weights_to=attention_weights,
-            attn_bias_for_padding=attn_bias_for_padding,
-        )
-
-        x *= _mask
-
-        return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
+        eval_dataset = None
+
+    return train_dataset, eval_dataset
+
+
+# def create_t2t_hparams(
+#     num_transformer_layers: int,
+#     transformer_size: int,
+#     num_heads: int,
+#     droprate: float,
+#     pos_encoding: Text,
+#     max_seq_length: int,
+#     unidirectional_encoder: bool = True,
+# ) -> "HParams":
+#     """Create parameters for t2t transformer."""
+#
+#     hparams = transformer_base()
+#
+#     hparams.num_hidden_layers = num_transformer_layers
+#     hparams.hidden_size = transformer_size
+#     # it seems to be factor of 4 for transformer architectures in t2t
+#     hparams.filter_size = hparams.hidden_size * 4
+#     hparams.num_heads = num_heads
+#     hparams.relu_dropout = droprate
+#     hparams.pos = pos_encoding
+#
+#     hparams.max_length = max_seq_length
+#
+#     hparams.unidirectional_encoder = unidirectional_encoder
+#
+#     hparams.self_attention_type = "dot_product_relative_v2"
+#     hparams.max_relative_position = 5
+#     hparams.add_relative_to_values = True
+#
+#     # When not in training mode, set all forms of dropout to zero.
+#     training = tf.keras.backend.learning_phase()
+#     for key, value in hparams.values().items():
+#         if key.endswith("dropout") or key == "label_smoothing":
+#             setattr(hparams, key, value * tf.cast(training, tf.float32))
+#
+#     return hparams
+#
+#
+# # noinspection PyUnresolvedReferences
+# def create_t2t_transformer_encoder(
+#     x_in: "tf.Tensor",
+#     pre_transformer: "tf.keras.layers.Layer",
+#     mask: "tf.Tensor",
+#     attention_weights: Dict[Text, "tf.Tensor"],
+#     hparams: "HParams",
+#     name: Text,
+# ) -> "tf.Tensor":
+#     """Create t2t transformer encoder."""
+#     with tf.variable_scope(f"transformer_{name}", reuse=tf.AUTO_REUSE):
+#         if len(mask.shape) == 2:
+#             _mask = tf.expand_dims(mask, -1)
+#         else:
+#             _mask = mask
+#
+#         x = pre_transformer(x_in)
+#
+#         if hparams.multiply_embedding_mode == "sqrt_depth":
+#             x *= hparams.hidden_size ** 0.5
+#
+#         (
+#             x,
+#             self_attention_bias,
+#             encoder_decoder_attention_bias,
+#         ) = transformer_prepare_encoder(x, None, hparams)
+#
+#         x *= _mask
+#
+#         x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
+#
+#         attn_bias_for_padding = None
+#         # Otherwise the encoder will just use encoder_self_attention_bias.
+#         if hparams.unidirectional_encoder:
+#             attn_bias_for_padding = encoder_decoder_attention_bias
+#
+#         x = transformer_encoder(
+#             x,
+#             self_attention_bias,
+#             hparams,
+#             nonpadding=_mask,
+#             save_weights_to=attention_weights,
+#             attn_bias_for_padding=attn_bias_for_padding,
+#         )
+#
+#         x *= _mask
+#
+#         return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
 
 
 def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor":
@@ -780,7 +640,7 @@ def _tf_sample_neg(
 
     tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1))
 
-    return tf.batch_gather(tiled_all_bs, neg_ids)
+    return tf.gather(tiled_all_bs, neg_ids, batch_dims=-1)
 
 
 def _tf_get_bad_mask(
@@ -838,7 +698,7 @@ def _tf_get_negs(
     return neg_embed, bad_negs
 
 
-def sample_negatives(
+def _sample_negatives(
     a_embed: "tf.Tensor",
     b_embed: "tf.Tensor",
     b_raw: "tf.Tensor",
@@ -879,7 +739,7 @@ def tf_raw_sim(
     return sim
 
 
-def tf_sim(
+def _tf_sim(
     pos_dial_embed: "tf.Tensor",
     pos_bot_embed: "tf.Tensor",
     neg_dial_embed: "tf.Tensor",
@@ -892,7 +752,7 @@ def tf_sim(
 
     # calculate similarity with several
     # embedded actions for the loss
-    neg_inf = large_compatible_negative(pos_dial_embed.dtype)
+    neg_inf = -1e9  # large_compatible_negative(pos_dial_embed.dtype)
 
     sim_pos = tf_raw_sim(pos_dial_embed, pos_bot_embed, mask)
     sim_neg = tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs
@@ -911,7 +771,7 @@ def tf_sim(
     return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial
 
 
-def tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
+def _tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
     """Calculate accuracy"""
 
     max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
@@ -921,7 +781,7 @@ def tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
 
 
 # noinspection PyPep8Naming
-def tf_loss_margin(
+def _tf_loss_margin(
     sim_pos: "tf.Tensor",
     sim_neg: "tf.Tensor",
     sim_neg_bot_bot: "tf.Tensor",
@@ -969,13 +829,10 @@ def tf_loss_margin(
     # average the loss over the batch
     loss = tf.reduce_mean(loss)
 
-    # add regularization losses
-    loss += tf.losses.get_regularization_loss()
-
     return loss
 
 
-def tf_loss_softmax(
+def _tf_loss_softmax(
     sim_pos: "tf.Tensor",
     sim_neg: "tf.Tensor",
     sim_neg_bot_bot: "tf.Tensor",
@@ -990,14 +847,8 @@ def tf_loss_softmax(
         [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1
     )
 
-    # create labels for softmax
-    if len(logits.shape) == 3:
-        pos_labels = tf.ones_like(logits[:, :, :1])
-        neg_labels = tf.zeros_like(logits[:, :, 1:])
-    else:  # len(logits.shape) == 2
-        pos_labels = tf.ones_like(logits[:, :1])
-        neg_labels = tf.zeros_like(logits[:, 1:])
-    labels = tf.concat([pos_labels, neg_labels], -1)
+    # create label_ids for softmax
+    label_ids = tf.zeros_like(logits[..., 0], tf.int32)
 
     if mask is None:
         mask = 1.0
@@ -1005,17 +856,27 @@ def tf_loss_softmax(
     if scale_loss:
         # mask loss by prediction confidence
         pos_pred = tf.nn.softmax(logits)[..., 0]
-        mask *= tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
+        scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
+    else:
+        scale_mask = mask
 
-    loss = tf.losses.softmax_cross_entropy(labels, logits, mask)
-    # add regularization losses
-    loss += tf.losses.get_regularization_loss()
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_ids, logits=logits)
+
+    # scale loss
+    if len(loss.shape) == 2:
+        # average over the sequence
+        loss = tf.reduce_sum(loss * scale_mask, -1) / tf.reduce_sum(mask, -1)
+    else:
+        loss *= scale_mask
+
+    # average the loss over all examples
+    loss = tf.reduce_mean(loss)
 
     return loss
 
 
 # noinspection PyPep8Naming
-def choose_loss(
+def _choose_loss(
     sim_pos: "tf.Tensor",
     sim_neg: "tf.Tensor",
     sim_neg_bot_bot: "tf.Tensor",
@@ -1032,7 +893,7 @@ def choose_loss(
     """Use loss depending on given option."""
 
     if loss_type == "margin":
-        return tf_loss_margin(
+        return _tf_loss_margin(
             sim_pos,
             sim_neg,
             sim_neg_bot_bot,
@@ -1045,7 +906,7 @@ def choose_loss(
             C_emb,
         )
     elif loss_type == "softmax":
-        return tf_loss_softmax(
+        return _tf_loss_softmax(
             sim_pos,
             sim_neg,
             sim_neg_bot_bot,
@@ -1085,10 +946,10 @@ def calculate_loss_acc(
         neg_bot_embed,
         dial_bad_negs,
         bot_bad_negs,
-    ) = sample_negatives(a_embed, b_embed, b_raw, all_b_embed, all_b_raw, num_neg)
+    ) = _sample_negatives(a_embed, b_embed, b_raw, all_b_embed, all_b_raw, num_neg)
 
     # calculate similarities
-    (sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial) = tf_sim(
+    (sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial) = _tf_sim(
         pos_dial_embed,
         pos_bot_embed,
         neg_dial_embed,
@@ -1098,9 +959,9 @@ def calculate_loss_acc(
         mask,
     )
 
-    acc = tf_calc_accuracy(sim_pos, sim_neg)
+    acc = _tf_calc_accuracy(sim_pos, sim_neg)
 
-    loss = choose_loss(
+    loss = _choose_loss(
         sim_pos,
         sim_neg,
         sim_neg_bot_bot,
@@ -1183,13 +1044,11 @@ def output_validation_stat(
 
 
 def train_tf_dataset(
-    train_init_op: "tf.Operation",
-    eval_init_op: "tf.Operation",
+    train_dataset: "tf.data.Dataset",
+    eval_dataset: "tf.data.Dataset",
     batch_size_in: "tf.Tensor",
-    metrics: TrainingMetrics,
-    train_op: "tf.Tensor",
-    session: "tf.Session",
-    is_training: "tf.Session",
+    train: Callable,
+    loss_metric,
     epochs: int,
     batch_size: Union[List[int], int],
     evaluate_on_num_examples: int,
@@ -1198,9 +1057,6 @@ def train_tf_dataset(
 ) -> None:
     """Train tf graph"""
 
-    session.run(tf.global_variables_initializer())
-    session.run(tf.local_variables_initializer())
-
     if evaluate_on_num_examples:
         logger.info(
             f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
@@ -1210,54 +1066,46 @@ def train_tf_dataset(
 
     train_metrics = TrainingMetrics(loss={}, score={})
     val_metrics = TrainingMetrics(loss={}, score={})
-    for ep in pbar:
 
-        ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
+    for ep in pbar:
 
-        session.run(train_init_op, feed_dict={batch_size_in: ep_batch_size})
+        # ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
+        # batchsize_in += ep_batch_size - batch_size_in
 
         ep_train_metrics = TrainingMetrics(
             loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
         )
-        batches_per_epoch = 0
-        while True:
-            try:
-                _, batch_train_metric = session.run(
-                    [train_op, metrics], feed_dict={is_training: True}
-                )
-                batches_per_epoch += 1
-                for name, value in batch_train_metric.loss.items():
-                    ep_train_metrics.loss[name] += value
-                for name, value in batch_train_metric.score.items():
-                    ep_train_metrics.score[name] += value
 
-            except tf.errors.OutOfRangeError:
-                break
+        for batch_in in train_dataset:
+            train(batch_in)
+            # exit()
 
-        for name, value in ep_train_metrics.loss.items():
-            train_metrics.loss[name] = value / batches_per_epoch
-        for name, value in ep_train_metrics.score.items():
-            train_metrics.score[name] = value / batches_per_epoch
+        mean_loss = loss_metric.result()
 
-        postfix_dict = {}
-        postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
+        # for name, value in ep_train_metrics.loss.items():
+        #     train_metrics.loss[name] = value / batches_per_epoch
+        # for name, value in ep_train_metrics.score.items():
+        #     train_metrics.score[name] = value / batches_per_epoch
 
-        if eval_init_op is not None:
-            if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
-                val_metrics = output_validation_stat(
-                    eval_init_op,
-                    metrics,
-                    session,
-                    is_training,
-                    batch_size_in,
-                    ep_batch_size,
-                )
+        postfix_dict = {"loss": mean_loss.numpy()}
+        postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
 
-            postfix_dict = _update_postfix_dict(postfix_dict, val_metrics, "val_")
+        # if eval_init_op is not None:
+        #     if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
+        #         val_metrics = output_validation_stat(
+        #             eval_init_op,
+        #             metrics,
+        #             session,
+        #             is_training,
+        #             batch_size_in,
+        #             ep_batch_size,
+        #         )
+        #
+        #     postfix_dict = _update_postfix_dict(postfix_dict, val_metrics, "val_")
 
         pbar.set_postfix(postfix_dict)
 
-        _write_training_metrics(output_file, ep, train_metrics, val_metrics)
+        # _write_training_metrics(output_file, ep, train_metrics, val_metrics)
 
     logger.info("Finished training.")
 
diff --git a/setup.py b/setup.py
index 4500f730c0e8..533527aad738 100644
--- a/setup.py
+++ b/setup.py
@@ -36,15 +36,10 @@
     "pymongo[tls,srv]~=3.8",
     "numpy~=1.16",
     "scipy~=1.2",
-    # "tensorflow~=1.15.0",
+    "tensorflow~=2.0",
     # absl is a tensorflow dependency, but produces double logging before 0.8
     # should be removed once tensorflow requires absl > 0.8 on its own
     "absl-py>=0.8.0",
-    # setuptools comes from tensorboard requirement:
-    # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
-    "setuptools >= 41.0.0",
-    "tensorflow-probability~=0.7.0",
-    "tensor2tensor~=1.14.0",
     "apscheduler~=3.0",
     "tqdm~=4.0",
     "networkx~=2.3.0",
@@ -84,15 +79,13 @@
     "kafka-python~=1.4",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
-    # remove when tensorflow@1.15.x or a pre-release patch is released
-    # https://github.com/tensorflow/tensorflow/issues/32319
-    "gast==0.2.2",
 ]
 
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
-    "convert": ["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"],
+    # TODO requirements for convert on tf2.0
+    # "convert": ["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
 }

From 1a151ca36b24ef397fd4bb03f05b153cd9048020 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 18 Dec 2019 17:34:16 +0100
Subject: [PATCH 064/633] uncommnet, remove print

---
 .../embedding_intent_classifier.py            | 42 +++++++++----------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index f141426267fa..3d7059734ad3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1267,8 +1267,6 @@ def __init__(self,
         self._named_entity_recognition = named_entity_recognition
         self._inverted_tag_dict = inverted_tag_dict
         self._num_tags = len(inverted_tag_dict)
-        print(inverted_tag_dict)
-        exit()
 
         # tf objects
         self._layers = []
@@ -1540,9 +1538,7 @@ def _entity_loss(
         log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
             logits, c, sequence_lengths, self._crf_params
         )
-        tf.print("ll", tf.reduce_max(log_likelihood))
         loss = tf.reduce_mean(-log_likelihood)
-        tf.print("loss", loss)
 
         # CRF preds
         pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
@@ -1568,23 +1564,23 @@ def _create_metrics(self, batch_in):
 
         metrics = TrainingMetrics(loss={}, score={})
 
-        # if self._masked_lm_loss:
-        #     loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
-        #
-        #     metrics.loss["m_loss"] = loss
-        #     metrics.score["m_acc"] = acc
-
-        # if self._intent_classification:
-        #     last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
-        #     # get _cls_ vector for intent classification
-        #     cls = tf.reduce_sum(text_transformed * last_text, 1)
-        #     label = self._create_bow(
-        #         tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
-        #     )
-        #     loss, acc = self._intent_loss(cls, label)
-        #
-        #     metrics.loss["i_loss"] = loss
-        #     metrics.score["i_acc"] = acc
+        if self._masked_lm_loss:
+            loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
+
+            metrics.loss["m_loss"] = loss
+            metrics.score["m_acc"] = acc
+
+        if self._intent_classification:
+            last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
+            # get _cls_ vector for intent classification
+            cls = tf.reduce_sum(text_transformed * last_text, 1)
+            label = self._create_bow(
+                tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
+            )
+            loss, acc = self._intent_loss(cls, label)
+
+            metrics.loss["i_loss"] = loss
+            metrics.score["i_acc"] = acc
 
         if self._named_entity_recognition:
             tags = tf_batch_data["tag_ids"][0]
@@ -1601,8 +1597,8 @@ def train(self, batch_in):
             self._build_all_b()
             metrics = self._create_metrics(batch_in)
             reg_losses = tf.math.add_n([tf.math.add_n(layer.losses) for layer in self._layers if layer.losses])
-            # total_loss = reg_losses
-            total_loss = tf.math.add_n(list(metrics.loss.values()))# + reg_losses
+
+            total_loss = tf.math.add_n(list(metrics.loss.values())) + reg_losses
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))

From 366ba42e098f623ea8b15306a1f842912a179baa Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 18 Dec 2019 18:38:27 +0100
Subject: [PATCH 065/633] add individual losses and accuracies

---
 .../embedding_intent_classifier.py            | 32 ++++++++++++++++---
 rasa/utils/train_utils.py                     | 19 +++++++++--
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3d7059734ad3..f814d699432a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -996,7 +996,13 @@ def train(
             eval_dataset,
             batch_size_in,
             train_func,
-            self.model.loss_metric,
+            [self.model.total_loss_metric,
+             self.model.mask_loss_metric,
+             self.model.intent_loss_metric,
+             self.model.entity_loss_metric,
+             self.model.mask_acc_metric,
+             self.model.intent_acc_metric,
+             self.model.entity_f1_metric],
             self.epochs,
             self.batch_in_size,
             self.evaluate_on_num_examples,
@@ -1346,6 +1352,8 @@ def __init__(self,
                                                     "logits")
         self._layers.extend(self._get_layers(self._embed))
 
+        self.entity_f1 = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro')
+
         # tf tensors
         self.training = tf.ones((), tf.bool)
         initializer = tf.keras.initializers.GlorotUniform()
@@ -1365,9 +1373,15 @@ def __init__(self,
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
-        self.loss_metric = tf.keras.metrics.Mean(name='t_loss')
+        self.total_loss_metric = tf.keras.metrics.Mean(name='t_loss')
+
+        self.mask_loss_metric = tf.keras.metrics.Mean(name='m_loss')
+        self.intent_loss_metric = tf.keras.metrics.Mean(name='i_loss')
+        self.entity_loss_metric = tf.keras.metrics.Mean(name='e_loss')
+
+        self.mask_acc_metric = tf.keras.metrics.Mean(name='m_acc')
         self.intent_acc_metric = tf.keras.metrics.Mean(name='i_acc')
-        self.entity_f1_metric = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro', name="e_f1")
+        self.entity_f1_metric = tf.keras.metrics.Mean(name='e_f1')
 
     def _combine_sparse_dense_features(
             self,
@@ -1544,7 +1558,7 @@ def _entity_loss(
         pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
 
         # calculate f1 score for train predictions
-        score = self.entity_f1_metric(c, pred_ids)
+        score = self.entity_f1(c, pred_ids)
 
         return loss, score
 
@@ -1603,4 +1617,12 @@ def train(self, batch_in):
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-        self.loss_metric.update_state(total_loss)
+        self.total_loss_metric.update_state(total_loss)
+
+        self.mask_loss_metric.update_state(metrics.loss["m_loss"])
+        self.intent_loss_metric.update_state(metrics.loss["i_loss"])
+        self.entity_loss_metric.update_state(metrics.loss["e_loss"])
+
+        self.mask_acc_metric.update_state(metrics.score["m_acc"])
+        self.intent_acc_metric.update_state(metrics.score["i_acc"])
+        self.entity_f1_metric.update_state(metrics.score["e_f1"])
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 653cf9d9fb12..8f55b884f666 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1048,7 +1048,7 @@ def train_tf_dataset(
     eval_dataset: "tf.data.Dataset",
     batch_size_in: "tf.Tensor",
     train: Callable,
-    loss_metric,
+    loss_metrics,
     epochs: int,
     batch_size: Union[List[int], int],
     evaluate_on_num_examples: int,
@@ -1080,14 +1080,27 @@ def train_tf_dataset(
             train(batch_in)
             # exit()
 
-        mean_loss = loss_metric.result()
+        mean_total_loss = loss_metrics[0].result()
+        mean_mask_loss = loss_metrics[1].result()
+        mean_intent_loss = loss_metrics[2].result()
+        mean_entity_loss = loss_metrics[3].result()
+        mean_mask_acc = loss_metrics[4].result()
+        mean_intent_acc = loss_metrics[5].result()
+        mean_entity_f1 = loss_metrics[6].result()
 
         # for name, value in ep_train_metrics.loss.items():
         #     train_metrics.loss[name] = value / batches_per_epoch
         # for name, value in ep_train_metrics.score.items():
         #     train_metrics.score[name] = value / batches_per_epoch
 
-        postfix_dict = {"loss": mean_loss.numpy()}
+        postfix_dict = {"t_loss": mean_total_loss.numpy(),
+                        "m_loss": mean_mask_loss.numpy(),
+                        "i_loss": mean_intent_loss.numpy(),
+                        "e_loss": mean_entity_loss.numpy(),
+                        "m_acc": mean_mask_acc.numpy(),
+                        "i_acc": mean_intent_acc.numpy(),
+                        "e_f1": mean_entity_f1.numpy(),
+                        }
         postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
 
         # if eval_init_op is not None:

From 2d7ab39b5acaba68bd0b40de5901a2a723870cd6 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 19 Dec 2019 13:58:16 +0100
Subject: [PATCH 066/633] fix entity prediction

---
 .../embedding_intent_classifier.py            | 79 ++++++++++---------
 rasa/utils/train_utils.py                     |  1 -
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index f814d699432a..6f2e25b9c515 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -991,6 +991,7 @@ def train(
 
         train_func = tf.function(self.model.train, input_signature=[train_dataset.element_spec])
         # train_func = self.model.train
+
         train_utils.train_tf_dataset(
             train_dataset,
             eval_dataset,
@@ -1352,8 +1353,6 @@ def __init__(self,
                                                     "logits")
         self._layers.extend(self._get_layers(self._embed))
 
-        self.entity_f1 = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro')
-
         # tf tensors
         self.training = tf.ones((), tf.bool)
         initializer = tf.keras.initializers.GlorotUniform()
@@ -1381,7 +1380,7 @@ def __init__(self,
 
         self.mask_acc_metric = tf.keras.metrics.Mean(name='m_acc')
         self.intent_acc_metric = tf.keras.metrics.Mean(name='i_acc')
-        self.entity_f1_metric = tf.keras.metrics.Mean(name='e_f1')
+        self.entity_f1_metric = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro')
 
     def _combine_sparse_dense_features(
             self,
@@ -1495,7 +1494,7 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
 
         a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
 
-        return train_utils.calculate_loss_acc(
+        loss, acc = train_utils.calculate_loss_acc(
             a_t_masked_embed,
             a_embed_masked,
             a_masked,
@@ -1510,13 +1509,16 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
             self._C_emb,
             self._scale_loss,
         )
+        self.mask_acc_metric.update_state(acc)
+
+        return loss
 
     def _intent_loss(self, a, b):
 
         a_embed = self._embed["text"](a)
         b_embed = self._embed["intent"](b)
 
-        return train_utils.calculate_loss_acc(
+        loss, acc = train_utils.calculate_loss_acc(
             a_embed,
             b_embed,
             b,
@@ -1531,6 +1533,9 @@ def _intent_loss(self, a, b):
             self._C_emb,
             self._scale_loss,
         )
+        self.intent_acc_metric.update_state(acc)
+
+        return loss
 
     def _entity_loss(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
@@ -1558,9 +1563,16 @@ def _entity_loss(
         pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
 
         # calculate f1 score for train predictions
-        score = self.entity_f1(c, pred_ids)
+        mask_up_to_last_bool = tf.cast(mask_up_to_last[:, :, 0], tf.bool)
+        c_masked = tf.boolean_mask(c, mask_up_to_last_bool)
+        pred_ids_masked = tf.boolean_mask(pred_ids, mask_up_to_last_bool)
+        # set `0` prediction to not a prediction
+        c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
+        pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        return loss, score
+        self.entity_f1_metric.update_state(c_masked_1, pred_ids_masked_1)
+
+        return loss
 
     def _build_all_b(self):
         if self._intent_classification:
@@ -1569,60 +1581,49 @@ def _build_all_b(self):
             )
             self.all_labels_embed = self._embed["intent"](self.all_labels)
 
-    def _create_metrics(self, batch_in):
+    def _losses(self, batch_in):
         tf_batch_data, _ = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
+
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss)
 
-        metrics = TrainingMetrics(loss={}, score={})
+        losses = {}
 
-        if self._masked_lm_loss:
-            loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
-
-            metrics.loss["m_loss"] = loss
-            metrics.score["m_acc"] = acc
-
-        if self._intent_classification:
-            last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
-            # get _cls_ vector for intent classification
-            cls = tf.reduce_sum(text_transformed * last_text, 1)
-            label = self._create_bow(
-                tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
-            )
-            loss, acc = self._intent_loss(cls, label)
-
-            metrics.loss["i_loss"] = loss
-            metrics.score["i_acc"] = acc
+        # if self._masked_lm_loss:
+        #     losses["m_loss"] = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
+        #
+        # if self._intent_classification:
+        #     last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
+        #     # get _cls_ vector for intent classification
+        #     cls = tf.reduce_sum(text_transformed * last_text, 1)
+        #     label = self._create_bow(
+        #         tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
+        #     )
+        #     losses["i_loss"] = self._intent_loss(cls, label)
 
         if self._named_entity_recognition:
             tags = tf_batch_data["tag_ids"][0]
 
-            loss, f1_score = self._entity_loss(text_transformed, tags, mask_text)
-            metrics.loss["e_loss"] = loss
-            metrics.score["e_f1"] = f1_score
+            losses["e_loss"] = self._entity_loss(text_transformed, tags, mask_text)
 
-        return metrics
+        return losses
 
     def train(self, batch_in):
 
         with tf.GradientTape() as tape:
             self._build_all_b()
-            metrics = self._create_metrics(batch_in)
+            losses = self._losses(batch_in)
             reg_losses = tf.math.add_n([tf.math.add_n(layer.losses) for layer in self._layers if layer.losses])
 
-            total_loss = tf.math.add_n(list(metrics.loss.values())) + reg_losses
+            total_loss = tf.math.add_n(list(losses.values())) + reg_losses
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
         self.total_loss_metric.update_state(total_loss)
 
-        self.mask_loss_metric.update_state(metrics.loss["m_loss"])
-        self.intent_loss_metric.update_state(metrics.loss["i_loss"])
-        self.entity_loss_metric.update_state(metrics.loss["e_loss"])
-
-        self.mask_acc_metric.update_state(metrics.score["m_acc"])
-        self.intent_acc_metric.update_state(metrics.score["i_acc"])
-        self.entity_f1_metric.update_state(metrics.score["e_f1"])
+        # self.mask_loss_metric.update_state(metrics.loss["m_loss"])
+        # self.intent_loss_metric.update_state(metrics.loss["i_loss"])
+        self.entity_loss_metric.update_state(losses["e_loss"])
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 8f55b884f666..df039ee81bef 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1078,7 +1078,6 @@ def train_tf_dataset(
 
         for batch_in in train_dataset:
             train(batch_in)
-            # exit()
 
         mean_total_loss = loss_metrics[0].result()
         mean_mask_loss = loss_metrics[1].result()

From 812f3778a92255f569f9c78e67f7327845d0c190 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 19 Dec 2019 15:05:10 +0100
Subject: [PATCH 067/633] get rid of global tensors

---
 .../embedding_intent_classifier.py            | 91 ++++++++-----------
 rasa/utils/train_utils.py                     |  2 +-
 2 files changed, 38 insertions(+), 55 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6f2e25b9c515..8ba29824119b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1253,15 +1253,6 @@ def __init__(self,
 
         # options
         self._sparse_input_dropout = sparse_input_dropout
-        # self._hparams = train_utils.create_t2t_hparams(
-        #     num_transformer_layers,
-        #     transformer_size,
-        #     num_heads,
-        #     droprate,
-        #     pos_encoding,
-        #     max_seq_length,
-        #     unidirectional_encoder,
-        # )
         self._num_neg = num_neg
         self._loss_type = loss_type
         self._mu_pos = mu_pos
@@ -1308,7 +1299,6 @@ def __init__(self,
         }
         self._layers.extend(self._get_layers(self._ffnn))
 
-        # noinspection PyUnresolvedReferences
         if num_transformer_layers > 0:
             self._transformer = tf_layers.TransformerEncoder(
                 num_transformer_layers,
@@ -1316,7 +1306,7 @@ def __init__(self,
                 num_heads,
                 transformer_size * 4,
                 self._ffnn["text"].output_dim,
-                256,
+                max_seq_length,
                 droprate
             )
             self._layers.append(self._transformer)
@@ -1366,9 +1356,6 @@ def __init__(self,
             trainable=True,
             name="crf_params"
         )
-        self.all_labels = None
-        self.all_labels_embed = None
-        self.attention_weights = {}
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
@@ -1466,14 +1453,6 @@ def _create_sequence(self,
             pre, lm_mask_bool = (x, None)
 
         transformed = self._transformer(pre, mask, self.training)
-        # transformed = train_utils.create_t2t_transformer_encoder(
-        #     self._ffnn[name](pre),
-        #     self._pre_transformer,
-        #     mask,
-        #     self.attention_weights,
-        #     self._hparams,
-        #     name,
-        # )
 
         return transformed, x, lm_mask_bool
 
@@ -1513,7 +1492,16 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
 
         return loss
 
+    def _build_all_b(self):
+        all_labels = self._create_bow(
+            self.tf_label_data["intent_features"], self.tf_label_data["intent_mask"][0], "intent"
+        )
+        all_labels_embed = self._embed["intent"](all_labels)
+
+        return all_labels_embed, all_labels
+
     def _intent_loss(self, a, b):
+        all_labels_embed, all_labels = self._build_all_b()
 
         a_embed = self._embed["text"](a)
         b_embed = self._embed["intent"](b)
@@ -1522,8 +1510,8 @@ def _intent_loss(self, a, b):
             a_embed,
             b_embed,
             b,
-            self.all_labels_embed,
-            self.all_labels,
+            all_labels_embed,
+            all_labels,
             self._num_neg,
             None,
             self._loss_type,
@@ -1533,18 +1521,17 @@ def _intent_loss(self, a, b):
             self._C_emb,
             self._scale_loss,
         )
+
         self.intent_acc_metric.update_state(acc)
 
         return loss
 
     def _entity_loss(
-        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor"
+        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor", sequence_lengths
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
 
-        mask_up_to_last = 1 - tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
-        sequence_lengths.set_shape([mask.shape[0]])
-
+        # remove cls token
+        sequence_lengths = tf.maximum(tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1)
         c = tf.cast(c[:, :, 0], tf.int32)
         logits = self._embed["logits"](a)
 
@@ -1563,9 +1550,9 @@ def _entity_loss(
         pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
 
         # calculate f1 score for train predictions
-        mask_up_to_last_bool = tf.cast(mask_up_to_last[:, :, 0], tf.bool)
-        c_masked = tf.boolean_mask(c, mask_up_to_last_bool)
-        pred_ids_masked = tf.boolean_mask(pred_ids, mask_up_to_last_bool)
+        mask_bool = tf.cast(mask[:, :, 0], tf.bool)
+        c_masked = tf.boolean_mask(c, mask_bool)
+        pred_ids_masked = tf.boolean_mask(pred_ids, mask_bool)
         # set `0` prediction to not a prediction
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
@@ -1574,46 +1561,42 @@ def _entity_loss(
 
         return loss
 
-    def _build_all_b(self):
-        if self._intent_classification:
-            self.all_labels = self._create_bow(
-                self.tf_label_data["intent_features"], self.tf_label_data["intent_mask"][0], "intent"
-            )
-            self.all_labels_embed = self._embed["intent"](self.all_labels)
-
     def _losses(self, batch_in):
         tf_batch_data, _ = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
+        sequence_lengths.set_shape([mask_text.shape[0]])
 
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss)
 
         losses = {}
 
-        # if self._masked_lm_loss:
-        #     losses["m_loss"] = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
-        #
-        # if self._intent_classification:
-        #     last_text = mask_text * tf.math.cumprod(1 - mask_text, axis=1, exclusive=True, reverse=True)
-        #     # get _cls_ vector for intent classification
-        #     cls = tf.reduce_sum(text_transformed * last_text, 1)
-        #     label = self._create_bow(
-        #         tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
-        #     )
-        #     losses["i_loss"] = self._intent_loss(cls, label)
+        if self._masked_lm_loss:
+            losses["m_loss"] = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
+
+        if self._intent_classification:
+            # get _cls_ vector for intent classification
+            last_index = tf.maximum(tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1)
+            idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
+            cls = tf.gather_nd(text_transformed, idxs)
+
+            label = self._create_bow(
+                tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
+            )
+            losses["i_loss"] = self._intent_loss(cls, label)
 
         if self._named_entity_recognition:
             tags = tf_batch_data["tag_ids"][0]
 
-            losses["e_loss"] = self._entity_loss(text_transformed, tags, mask_text)
+            losses["e_loss"] = self._entity_loss(text_transformed, tags, mask_text, sequence_lengths)
 
         return losses
 
     def train(self, batch_in):
 
         with tf.GradientTape() as tape:
-            self._build_all_b()
             losses = self._losses(batch_in)
             reg_losses = tf.math.add_n([tf.math.add_n(layer.losses) for layer in self._layers if layer.losses])
 
@@ -1624,6 +1607,6 @@ def train(self, batch_in):
 
         self.total_loss_metric.update_state(total_loss)
 
-        # self.mask_loss_metric.update_state(metrics.loss["m_loss"])
-        # self.intent_loss_metric.update_state(metrics.loss["i_loss"])
+        self.mask_loss_metric.update_state(losses["m_loss"])
+        self.intent_loss_metric.update_state(losses["i_loss"])
         self.entity_loss_metric.update_state(losses["e_loss"])
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index df039ee81bef..8635e2df606c 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -855,7 +855,7 @@ def _tf_loss_softmax(
 
     if scale_loss:
         # mask loss by prediction confidence
-        pos_pred = tf.nn.softmax(logits)[..., 0]
+        pos_pred = tf.stop_gradient(tf.nn.softmax(logits)[..., 0])
         scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
     else:
         scale_mask = mask

From e7cabec590f5a12800274ea939086967550b11a8 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 19 Dec 2019 16:05:54 +0100
Subject: [PATCH 068/633] trains seems to be working

---
 rasa/nlu/classifiers/embedding_intent_classifier.py |  1 -
 rasa/utils/train_utils.py                           | 11 +++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8ba29824119b..a3f6aa17a313 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1566,7 +1566,6 @@ def _losses(self, batch_in):
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
-        sequence_lengths.set_shape([mask_text.shape[0]])
 
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 8635e2df606c..4dce1786f7d9 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1076,9 +1076,20 @@ def train_tf_dataset(
             loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
         )
 
+        # Reset the metrics
+        loss_metrics[0].reset_states()
+        loss_metrics[1].reset_states()
+        loss_metrics[2].reset_states()
+        loss_metrics[3].reset_states()
+        loss_metrics[4].reset_states()
+        loss_metrics[5].reset_states()
+        loss_metrics[6].reset_states()
+
+        # Train on batches
         for batch_in in train_dataset:
             train(batch_in)
 
+        # Get the metric results
         mean_total_loss = loss_metrics[0].result()
         mean_mask_loss = loss_metrics[1].result()
         mean_intent_loss = loss_metrics[2].result()

From 22fa430967ad84f8922d1c88c93424cb10bfe05e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 19 Dec 2019 17:32:39 +0100
Subject: [PATCH 069/633] add if

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index a3f6aa17a313..6540b74875f3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1605,7 +1605,9 @@ def train(self, batch_in):
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
         self.total_loss_metric.update_state(total_loss)
-
-        self.mask_loss_metric.update_state(losses["m_loss"])
-        self.intent_loss_metric.update_state(losses["i_loss"])
-        self.entity_loss_metric.update_state(losses["e_loss"])
+        if self._masked_lm_loss:
+            self.mask_loss_metric.update_state(losses["m_loss"])
+        if self._intent_classification:
+            self.intent_loss_metric.update_state(losses["i_loss"])
+        if self._named_entity_recognition:
+            self.entity_loss_metric.update_state(losses["e_loss"])

From 63bc1fd6be917915a151dc8982aaf0bfa150f184 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 20 Dec 2019 11:50:04 +0100
Subject: [PATCH 070/633] shuffle over sequence as well

---
 .../classifiers/embedding_intent_classifier.py    | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6540b74875f3..68565fb5756d 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1415,9 +1415,16 @@ def _mask_input(
                 tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
                 * pad_mask_up_to_last
         )
-        a_shuffle = tf.stop_gradient(
-            tf.random.shuffle(a * mask_up_to_last + a_random_pad)
-        )
+        # shuffle over batch dim
+        a_shuffle = tf.random.shuffle(a * mask_up_to_last + a_random_pad)
+
+        # shuffle over sequence dim
+        a_shuffle = tf.transpose(a_shuffle, [1, 0, 2])
+        a_shuffle = tf.random.shuffle(a_shuffle)
+        a_shuffle = tf.transpose(a_shuffle, [1, 0, 2])
+
+        # shuffle doesn't support backprop
+        a_shuffle = tf.stop_gradient(a_shuffle)
 
         a_mask = tf.tile(self._mask_vector, (tf.shape(a)[0], tf.shape(a)[1], 1))
 
@@ -1531,7 +1538,7 @@ def _entity_loss(
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
 
         # remove cls token
-        sequence_lengths = tf.maximum(tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1)
+        sequence_lengths = sequence_lengths - 1
         c = tf.cast(c[:, :, 0], tf.int32)
         logits = self._embed["logits"](a)
 

From 2346cf9bdd746a94ce7baa992c566b2ec1ec3ceb Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 20 Dec 2019 14:32:39 +0100
Subject: [PATCH 071/633] no bias for pre transformer layer

---
 rasa/utils/tf_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 99c89875840f..5a61662852ce 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -311,7 +311,7 @@ def __init__(self, num_layers, d_model, num_heads, dff, input_dim,
         self.num_layers = num_layers
 
         # TODO use Embed
-        self.embedding = tf.keras.layers.Dense(input_dim=input_dim, units=d_model)
+        self.embedding = tf.keras.layers.Dense(input_dim=input_dim, units=d_model, use_bias=False)
         self.pos_encoding = self._positional_encoding(maximum_position_encoding,
                                                       self.d_model)
 

From 7a38b1a1a97c80334d46e71483a031f8d74a32a4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 02:19:26 +0100
Subject: [PATCH 072/633] first transformer improvements from t2t

---
 .../embedding_intent_classifier.py            |   3 +-
 rasa/utils/tf_layers.py                       | 114 ++++++++++--------
 2 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 68565fb5756d..82c2e4ed4a13 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1459,7 +1459,8 @@ def _create_sequence(self,
         else:
             pre, lm_mask_bool = (x, None)
 
-        transformed = self._transformer(pre, mask, self.training)
+        transformed = self._transformer(pre, 1 - mask, self.training)
+        transformed = tf.nn.relu(transformed)
 
         return transformed, x, lm_mask_bool
 
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 5a61662852ce..944036ff3484 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -153,12 +153,13 @@ def call(self, inputs):
 
 
 # from https://www.tensorflow.org/tutorials/text/transformer
+# and https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/transformer_layers.py#L137
 # TODO add weight regularization (L1)
 # TODO collect losses
 class MultiHeadAttention(tf.keras.layers.Layer):
 
     @staticmethod
-    def _scaled_dot_product_attention(q, k, v, mask):
+    def _scaled_dot_product_attention(q, k, v, pad_mask):
         """Calculate the attention weights.
         q, k, v must have matching leading dimensions.
         k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
@@ -169,7 +170,7 @@ def _scaled_dot_product_attention(q, k, v, mask):
           q: query shape == (..., seq_len_q, depth)
           k: key shape == (..., seq_len_k, depth)
           v: value shape == (..., seq_len_v, depth_v)
-          mask: Float tensor with shape broadcastable
+          pad_mask: Float tensor with shape broadcastable
                 to (..., seq_len_q, seq_len_k). Defaults to None.
 
         Returns:
@@ -180,15 +181,15 @@ def _scaled_dot_product_attention(q, k, v, mask):
 
         # scale matmul_qk
         dk = tf.cast(tf.shape(k)[-1], tf.float32)
-        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+        logits = matmul_qk / tf.math.sqrt(dk)
 
         # add the mask to the scaled tensor.
-        if mask is not None:
-            scaled_attention_logits += (mask * -1e9)
+        if pad_mask is not None:
+            logits += (pad_mask * -1e9)
 
-            # softmax is normalized on the last axis (seq_len_k) so that the scores
+        # softmax is normalized on the last axis (seq_len_k) so that the scores
         # add up to 1.
-        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+        attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)
 
         output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
 
@@ -203,42 +204,49 @@ def __init__(self, d_model, num_heads):
 
         self.depth = d_model // self.num_heads
 
-        self.wq = tf.keras.layers.Dense(d_model)
-        self.wk = tf.keras.layers.Dense(d_model)
-        self.wv = tf.keras.layers.Dense(d_model)
+        self.wq = tf.keras.layers.Dense(d_model, use_bias=False)
+        self.wk = tf.keras.layers.Dense(d_model, use_bias=False)
+        self.wv = tf.keras.layers.Dense(d_model, use_bias=False)
 
-        self.dense = tf.keras.layers.Dense(d_model)
+        self.dense = tf.keras.layers.Dense(d_model, use_bias=False)
 
-    def split_heads(self, x, batch_size):
+    def _split_heads(self, x):
         """Split the last dimension into (num_heads, depth).
         Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
         """
-        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+
+        x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self.depth))
         return tf.transpose(x, perm=[0, 2, 1, 3])
 
-    def call(self, v, k, q, mask):
-        batch_size = tf.shape(q)[0]
+    def _combine_heads(self, x):
+        """Inverse of split_heads.
+
+        Args:
+          x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
+
+        Returns:
+          a Tensor with shape [batch, length, channels]
+        """
+
+        x = tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
+        return tf.reshape(x, (tf.shape(x)[0], -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+
+    def call(self, v, k, q, pad_mask):
 
         q = self.wq(q)  # (batch_size, seq_len, d_model)
         k = self.wk(k)  # (batch_size, seq_len, d_model)
         v = self.wv(v)  # (batch_size, seq_len, d_model)
 
-        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
-        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
-        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
+        q = self._split_heads(q)  # (batch_size, num_heads, seq_len_q, depth)
+        k = self._split_heads(k)  # (batch_size, num_heads, seq_len_k, depth)
+        v = self._split_heads(v)  # (batch_size, num_heads, seq_len_v, depth)
 
-        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
+        attention, attention_weights = self._scaled_dot_product_attention(q, k, v, pad_mask)
+        # attention.shape == (batch_size, num_heads, seq_len_q, depth)
         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
-        scaled_attention, attention_weights = self._scaled_dot_product_attention(
-            q, k, v, mask)
-
-        scaled_attention = tf.transpose(scaled_attention,
-                                        perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
-
-        concat_attention = tf.reshape(scaled_attention,
-                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+        attention = self._combine_heads(attention)  # (batch_size, seq_len_q, d_model)
 
-        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
+        output = self.dense(attention)  # (batch_size, seq_len_q, d_model)
 
         return output, attention_weights
 
@@ -252,6 +260,7 @@ def __init__(self, d_model, num_heads, dff, rate=0.1):
         self.mha = MultiHeadAttention(d_model, num_heads)
         self.ffn = tf.keras.Sequential([
             tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dropout(rate),
             tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
         ])
 
@@ -261,27 +270,28 @@ def __init__(self, d_model, num_heads, dff, rate=0.1):
         self.dropout1 = tf.keras.layers.Dropout(rate)
         self.dropout2 = tf.keras.layers.Dropout(rate)
 
-    def call(self, x, mask, training):
-        # mask is (batch_size, 1, 1, seq_len)
-        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
+    def call(self, x, pad_mask, training):
+        x1 = self.layernorm1(x)  # (batch_size, input_seq_len, d_model)
+        attn_output, _ = self.mha(x1, x1, x1, pad_mask)  # (batch_size, input_seq_len, d_model)
         attn_output = self.dropout1(attn_output, training=training)
-        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
+        out1 = x + attn_output
 
-        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
+        out2 = self.layernorm2(out1)  # (batch_size, input_seq_len, d_model)
+        ffn_output = self.ffn(out2)  # (batch_size, input_seq_len, d_model)
         ffn_output = self.dropout2(ffn_output, training=training)
-        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
+        out2 = out1 + ffn_output
 
         return out2
 
 
-def create_look_ahead_mask(size):
-    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
-    return mask  # (seq_len, seq_len)
-
-
 # TODO collect losses
 class TransformerEncoder(tf.keras.layers.Layer):
 
+    @staticmethod
+    def _look_ahead_pad_mask(size):
+        pad_mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+        return pad_mask[tf.newaxis, tf.newaxis, :, :]   # (1, 1, seq_len, seq_len)
+
     @staticmethod
     def _get_angles(pos, i, d_model):
         angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
@@ -311,30 +321,34 @@ def __init__(self, num_layers, d_model, num_heads, dff, input_dim,
         self.num_layers = num_layers
 
         # TODO use Embed
-        self.embedding = tf.keras.layers.Dense(input_dim=input_dim, units=d_model, use_bias=False)
+        # TODO add unidirectional
+        self.embedding = tf.keras.layers.Dense(input_dim=input_dim,
+                                               units=d_model, use_bias=False)
         self.pos_encoding = self._positional_encoding(maximum_position_encoding,
                                                       self.d_model)
-
+        self.dropout = tf.keras.layers.Dropout(rate)
         self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate)
                            for _ in range(num_layers)]
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
-        self.dropout = tf.keras.layers.Dropout(rate)
-
-    def call(self, x, mask, training):
+    def call(self, x, pad_mask, training):
 
         seq_len = tf.shape(x)[1]
-        mask = tf.squeeze(mask, -1)
-        mask = mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
 
         # adding embedding and position encoding.
         x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
         x += self.pos_encoding[:, :seq_len, :]
-
         x = self.dropout(x, training=training)
+        x *= 1 - pad_mask
 
+        pad_mask = tf.squeeze(pad_mask, -1)
+        pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
         for i in range(self.num_layers):
-            # mask is (batch_size, 1, 1, seq_len)
-            x = self.enc_layers[i](x, mask, training)
+            # padding mask is (batch_size, 1, 1, seq_len)
+            x = self.enc_layers[i](x, pad_mask, training)
 
-        return x  # (batch_size, input_seq_len, d_model)
+        # if normalization is done in layer_preprocess, then it should also be done
+        # on the output, since the output can grow very large, being the sum of
+        # a whole stack of unnormalized layer outputs.
+        return self.layernorm(x)  # (batch_size, input_seq_len, d_model)

From 7fdf0fb9926be4cf119bc15eaa5734585f58b3cc Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 17:20:26 +0100
Subject: [PATCH 073/633] remove keras model

---
 rasa/utils/tf_layers.py | 52 +++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 944036ff3484..5b7d872fad09 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -231,8 +231,7 @@ def _combine_heads(self, x):
         x = tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
         return tf.reshape(x, (tf.shape(x)[0], -1, self.d_model))  # (batch_size, seq_len_q, d_model)
 
-    def call(self, v, k, q, pad_mask):
-
+    def call(self, v, k, q, pad_mask=None):
         q = self.wq(q)  # (batch_size, seq_len, d_model)
         k = self.wk(k)  # (batch_size, seq_len, d_model)
         v = self.wv(v)  # (batch_size, seq_len, d_model)
@@ -257,31 +256,31 @@ class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(self, d_model, num_heads, dff, rate=0.1):
         super(TransformerEncoderLayer, self).__init__()
 
+        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self.mha = MultiHeadAttention(d_model, num_heads)
-        self.ffn = tf.keras.Sequential([
+        self.dropout = tf.keras.layers.Dropout(rate)
+
+        self.ffn_layers = [
+            tf.keras.layers.LayerNormalization(epsilon=1e-6),
             tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
             tf.keras.layers.Dropout(rate),
-            tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
-        ])
-
-        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-
-        self.dropout1 = tf.keras.layers.Dropout(rate)
-        self.dropout2 = tf.keras.layers.Dropout(rate)
+            tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
+            tf.keras.layers.Dropout(rate),
+        ]
 
     def call(self, x, pad_mask, training):
-        x1 = self.layernorm1(x)  # (batch_size, input_seq_len, d_model)
-        attn_output, _ = self.mha(x1, x1, x1, pad_mask)  # (batch_size, input_seq_len, d_model)
-        attn_output = self.dropout1(attn_output, training=training)
-        out1 = x + attn_output
 
-        out2 = self.layernorm2(out1)  # (batch_size, input_seq_len, d_model)
-        ffn_output = self.ffn(out2)  # (batch_size, input_seq_len, d_model)
-        ffn_output = self.dropout2(ffn_output, training=training)
-        out2 = out1 + ffn_output
+        x_norm = self.layernorm(x)  # (batch_size, input_seq_len, d_model)
+        attn, _ = self.mha(x_norm, x_norm, x_norm, pad_mask)  # (batch_size, input_seq_len, d_model)
+        attn = self.dropout(attn, training=training)
+        x += attn
+
+        ffn = x
+        for layer in self.ffn_layers:
+            ffn = layer(ffn, training=training)  # (batch_size, input_seq_len, d_model)
+        x += ffn
 
-        return out2
+        return x
 
 
 # TODO collect losses
@@ -314,14 +313,13 @@ def _positional_encoding(cls, position, d_model):
         return tf.cast(pos_encoding, dtype=tf.float32)
 
     def __init__(self, num_layers, d_model, num_heads, dff, input_dim,
-                 maximum_position_encoding, rate=0.1):
+                 maximum_position_encoding, rate=0.1, unidirectional=False):
         super(TransformerEncoder, self).__init__()
 
         self.d_model = d_model
-        self.num_layers = num_layers
+        self.unidirectional = unidirectional
 
         # TODO use Embed
-        # TODO add unidirectional
         self.embedding = tf.keras.layers.Dense(input_dim=input_dim,
                                                units=d_model, use_bias=False)
         self.pos_encoding = self._positional_encoding(maximum_position_encoding,
@@ -342,11 +340,15 @@ def call(self, x, pad_mask, training):
         x = self.dropout(x, training=training)
         x *= 1 - pad_mask
 
+        # TODO add unidirectional
         pad_mask = tf.squeeze(pad_mask, -1)
         pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
-        for i in range(self.num_layers):
+        # if self.unidirectional:
+        #     pad_mask *= self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
+
+        for layer in self.enc_layers:
             # padding mask is (batch_size, 1, 1, seq_len)
-            x = self.enc_layers[i](x, pad_mask, training)
+            x = layer(x, pad_mask, training)
 
         # if normalization is done in layer_preprocess, then it should also be done
         # on the output, since the output can grow very large, being the sum of

From 004d800fe83618f47078476056a0e2150924c511 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 19:00:54 +0100
Subject: [PATCH 074/633] remove input dim

---
 .../embedding_intent_classifier.py            | 35 +++++++------------
 rasa/utils/tf_layers.py                       | 27 +++++---------
 2 files changed, 21 insertions(+), 41 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 82c2e4ed4a13..d8b1545d3c39 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1184,18 +1184,17 @@ class DIET(tf.Module):
     @staticmethod
     def _create_sparse_dense_layer(values, name, C2, dense_dim):
 
-        input_dim = None
+        sparse = False
         for v in values:
             if isinstance(v[0], scipy.sparse.spmatrix):
-                input_dim = v[0].shape[-1]
+                sparse = True
             else:
                 # if dense features are present
                 # use the feature dimension of the dense features
                 dense_dim = v[0].shape[-1]
 
-        if input_dim:
-            return tf_layers.DenseForSparse(input_dim=input_dim,
-                                            units=dense_dim,
+        if sparse:
+            return tf_layers.DenseForSparse(units=dense_dim,
                                             C2=C2,
                                             name=name)
 
@@ -1282,17 +1281,12 @@ def __init__(self,
         }
         self._layers.extend(self._get_layers(self._sparse_to_dense))
 
-        text_input_dim = self._input_dim(session_data["text_features"], dense_dim)
-        intent_input_dim = self._input_dim(session_data["intent_features"], dense_dim)
-
         self._ffnn = {
-            "text": tf_layers.Ffnn(text_input_dim,
-                                   hidden_layer_sizes["text"],
+            "text": tf_layers.Ffnn(hidden_layer_sizes["text"],
                                    droprate,
                                    C2,
                                    "text_intent" if share_hidden_layers else "text"),
-            "intent": tf_layers.Ffnn(intent_input_dim,
-                                     hidden_layer_sizes["intent"],
+            "intent": tf_layers.Ffnn(hidden_layer_sizes["intent"],
                                      droprate,
                                      C2,
                                      "text_intent" if share_hidden_layers else "intent")
@@ -1305,7 +1299,6 @@ def __init__(self,
                 transformer_size,
                 num_heads,
                 transformer_size * 4,
-                self._ffnn["text"].output_dim,
                 max_seq_length,
                 droprate
             )
@@ -1315,30 +1308,25 @@ def __init__(self,
 
         self._embed = {}
         if self._masked_lm_loss:
-            self._embed["text_mask"] = tf_layers.Embed(transformer_size,
-                                                       embed_dim,
+            self._embed["text_mask"] = tf_layers.Embed(embed_dim,
                                                        C2,
                                                        "text_mask",
                                                        similarity_type)
-            self._embed["text_token"] = tf_layers.Embed(text_input_dim,
-                                                        embed_dim,
+            self._embed["text_token"] = tf_layers.Embed(embed_dim,
                                                         C2,
                                                         "text_token",
                                                         similarity_type)
         if self._intent_classification:
-            self._embed["text"] = tf_layers.Embed(transformer_size,
-                                                  embed_dim,
+            self._embed["text"] = tf_layers.Embed(embed_dim,
                                                   C2,
                                                   "text",
                                                   similarity_type)
-            self._embed["intent"] = tf_layers.Embed(self._ffnn["intent"].output_dim,
-                                                    embed_dim,
+            self._embed["intent"] = tf_layers.Embed(embed_dim,
                                                     C2,
                                                     "intent",
                                                     similarity_type)
         if self._named_entity_recognition:
-            self._embed["logits"] = tf_layers.Embed(transformer_size,
-                                                    self._num_tags,
+            self._embed["logits"] = tf_layers.Embed(self._num_tags,
                                                     C2,
                                                     "logits")
         self._layers.extend(self._get_layers(self._embed))
@@ -1346,6 +1334,7 @@ def __init__(self,
         # tf tensors
         self.training = tf.ones((), tf.bool)
         initializer = tf.keras.initializers.GlorotUniform()
+        text_input_dim = self._input_dim(session_data["text_features"], dense_dim)
         self._mask_vector = tf.Variable(
             initial_value=initializer((1, 1, text_input_dim)),
             trainable=True,
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 5b7d872fad09..310467bd0424 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -78,7 +78,6 @@ class Ffnn(tf.keras.layers.Layer):
     # noinspection PyPep8Naming
     def __init__(
         self,
-        input_dim: int,
         layer_sizes: List[int],
         droprate: float,
         C2: float,
@@ -93,18 +92,14 @@ def __init__(
         for i, layer_size in enumerate(layer_sizes):
             self._layers.append(tf.keras.layers.Dense(
                 units=layer_size,
-                input_dim=input_dim,
                 activation=activation,
                 use_bias=use_bias,
                 kernel_initializer=kernel_initializer,
                 kernel_regularizer=tf.keras.regularizers.l2(C2),
                 name=f"hidden_layer_{layer_name_suffix}_{i}",
             ))
-            input_dim = layer_size
             self._layers.append(tf.keras.layers.Dropout(rate=droprate))
 
-        self.output_dim = input_dim
-
     def call(self, inputs, training):
         x = inputs
         for layer in self._layers:
@@ -119,7 +114,6 @@ class Embed(tf.keras.layers.Layer):
     # noinspection PyPep8Naming
     def __init__(
             self,
-            input_dim: int,
             embed_dim: int,
             C2: float,
             layer_name_suffix: Text,
@@ -136,7 +130,6 @@ def __init__(
 
         self._layers = [tf.keras.layers.Dense(
             units=embed_dim,
-            input_dim=input_dim,
             activation=None,
             kernel_regularizer=tf.keras.regularizers.l2(C2),
             name=f"embed_layer_{layer_name_suffix}",
@@ -312,17 +305,16 @@ def _positional_encoding(cls, position, d_model):
 
         return tf.cast(pos_encoding, dtype=tf.float32)
 
-    def __init__(self, num_layers, d_model, num_heads, dff, input_dim,
-                 maximum_position_encoding, rate=0.1, unidirectional=False):
+    def __init__(self, num_layers, d_model, num_heads, dff,
+                 max_seq_length, rate=0.1, unidirectional=False):
         super(TransformerEncoder, self).__init__()
 
         self.d_model = d_model
         self.unidirectional = unidirectional
 
         # TODO use Embed
-        self.embedding = tf.keras.layers.Dense(input_dim=input_dim,
-                                               units=d_model, use_bias=False)
-        self.pos_encoding = self._positional_encoding(maximum_position_encoding,
+        self.embedding = tf.keras.layers.Dense(units=d_model, use_bias=False)
+        self.pos_encoding = self._positional_encoding(max_seq_length,
                                                       self.d_model)
         self.dropout = tf.keras.layers.Dropout(rate)
         self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate)
@@ -336,18 +328,17 @@ def call(self, x, pad_mask, training):
         # adding embedding and position encoding.
         x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self.pos_encoding[:, :seq_len, :]
+        x += self.pos_encoding[:, :seq_len, :] * (1 - pad_mask)
         x = self.dropout(x, training=training)
-        x *= 1 - pad_mask
 
-        # TODO add unidirectional
         pad_mask = tf.squeeze(pad_mask, -1)
         pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
-        # if self.unidirectional:
-        #     pad_mask *= self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
+        if self.unidirectional:
+            pad_mask = tf.minimum(
+                1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
+            )  # (batch_size, 1, seq_len, seq_len)
 
         for layer in self.enc_layers:
-            # padding mask is (batch_size, 1, 1, seq_len)
             x = layer(x, pad_mask, training)
 
         # if normalization is done in layer_preprocess, then it should also be done

From 3360be5f73f0553b2e706464a9f57ce2bd230b13 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 19:09:01 +0100
Subject: [PATCH 075/633] comments

---
 rasa/utils/tf_layers.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 310467bd0424..37103dc83372 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -146,8 +146,7 @@ def call(self, inputs):
 
 
 # from https://www.tensorflow.org/tutorials/text/transformer
-# and https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/transformer_layers.py#L137
-# TODO add weight regularization (L1)
+# and https://github.com/tensorflow/tensor2tensor
 # TODO collect losses
 class MultiHeadAttention(tf.keras.layers.Layer):
 
@@ -197,10 +196,12 @@ def __init__(self, d_model, num_heads):
 
         self.depth = d_model // self.num_heads
 
+        # TODO add weight regularization (L1)
         self.wq = tf.keras.layers.Dense(d_model, use_bias=False)
         self.wk = tf.keras.layers.Dense(d_model, use_bias=False)
         self.wv = tf.keras.layers.Dense(d_model, use_bias=False)
 
+        # TODO add weight regularization (L2)
         self.dense = tf.keras.layers.Dense(d_model, use_bias=False)
 
     def _split_heads(self, x):
@@ -243,7 +244,6 @@ def call(self, v, k, q, pad_mask=None):
         return output, attention_weights
 
 
-# TODO add weight regularization (L2)
 # TODO collect losses
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(self, d_model, num_heads, dff, rate=0.1):
@@ -253,6 +253,7 @@ def __init__(self, d_model, num_heads, dff, rate=0.1):
         self.mha = MultiHeadAttention(d_model, num_heads)
         self.dropout = tf.keras.layers.Dropout(rate)
 
+        # TODO add weight regularization (L2)
         self.ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
             tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
@@ -263,25 +264,25 @@ def __init__(self, d_model, num_heads, dff, rate=0.1):
 
     def call(self, x, pad_mask, training):
 
-        x_norm = self.layernorm(x)  # (batch_size, input_seq_len, d_model)
-        attn, _ = self.mha(x_norm, x_norm, x_norm, pad_mask)  # (batch_size, input_seq_len, d_model)
+        x_norm = self.layernorm(x)  # (batch_size, seq_len, d_model)
+        attn, _ = self.mha(x_norm, x_norm, x_norm, pad_mask)
         attn = self.dropout(attn, training=training)
         x += attn
 
-        ffn = x
+        ffn = x  # (batch_size, seq_len, d_model)
         for layer in self.ffn_layers:
-            ffn = layer(ffn, training=training)  # (batch_size, input_seq_len, d_model)
+            ffn = layer(ffn, training=training)
         x += ffn
 
-        return x
+        return x  # (batch_size, seq_len, d_model)
 
 
 # TODO collect losses
 class TransformerEncoder(tf.keras.layers.Layer):
 
     @staticmethod
-    def _look_ahead_pad_mask(size):
-        pad_mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+    def _look_ahead_pad_mask(seq_len):
+        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
         return pad_mask[tf.newaxis, tf.newaxis, :, :]   # (1, 1, seq_len, seq_len)
 
     @staticmethod
@@ -323,17 +324,16 @@ def __init__(self, num_layers, d_model, num_heads, dff,
 
     def call(self, x, pad_mask, training):
 
-        seq_len = tf.shape(x)[1]
-
         # adding embedding and position encoding.
-        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
+        x = self.embedding(x)  # (batch_size, seq_len, d_model)
         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self.pos_encoding[:, :seq_len, :] * (1 - pad_mask)
+        x += self.pos_encoding[:, :tf.shape(x)[1], :] * (1 - pad_mask)
         x = self.dropout(x, training=training)
 
-        pad_mask = tf.squeeze(pad_mask, -1)
+        pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
         pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
         if self.unidirectional:
+            # add look ahead pad mask to emulate unidirectional behavior
             pad_mask = tf.minimum(
                 1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
             )  # (batch_size, 1, seq_len, seq_len)
@@ -341,7 +341,7 @@ def call(self, x, pad_mask, training):
         for layer in self.enc_layers:
             x = layer(x, pad_mask, training)
 
-        # if normalization is done in layer_preprocess, then it should also be done
+        # if normalization is done in encoding layers, then it should also be done
         # on the output, since the output can grow very large, being the sum of
         # a whole stack of unnormalized layer outputs.
-        return self.layernorm(x)  # (batch_size, input_seq_len, d_model)
+        return self.layernorm(x)  # (batch_size, seq_len, d_model)

From 882061718c0fbc3846710c6d294c3c7093bc58e9 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 19:19:35 +0100
Subject: [PATCH 076/633] C2 to reg_lambda in layers

---
 rasa/utils/tf_layers.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 37103dc83372..cc226d25fa2c 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -33,6 +33,7 @@ def call(self, inputs, training):
         to_retain = tf.greater_equal(to_retain_prob, self.rate)
         dropped_inputs = tf.sparse.retain(inputs, to_retain)
         outputs = tf.cond(training, lambda: dropped_inputs, lambda: inputs)
+        # noinspection PyProtectedMember
         outputs._dense_shape = inputs._dense_shape
 
         return outputs
@@ -43,13 +44,11 @@ class DenseForSparse(tf.keras.layers.Dense):
 
     # noinspection PyPep8Naming
     def __init__(self,
-                 C2: float,
-                 activation: Optional[Callable] = tf.nn.relu,
+                 reg_lambda: float,
                  **kwargs):
-        kernel_regularizer = tf.keras.regularizers.l1(C2)
+        kernel_regularizer = tf.keras.regularizers.l1(reg_lambda)
 
         super(DenseForSparse, self).__init__(kernel_regularizer=kernel_regularizer,
-                                             activation=activation,
                                              **kwargs)
 
     def call(self, inputs):
@@ -80,11 +79,8 @@ def __init__(
         self,
         layer_sizes: List[int],
         droprate: float,
-        C2: float,
+        reg_lambda: float,
         layer_name_suffix: Text,
-        activation: Optional[Callable] = tf.nn.relu,
-        use_bias: bool = True,
-        kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None,
     ):
         super(Ffnn, self).__init__(name=f"ffnn_{layer_name_suffix}")
 
@@ -92,10 +88,8 @@ def __init__(
         for i, layer_size in enumerate(layer_sizes):
             self._layers.append(tf.keras.layers.Dense(
                 units=layer_size,
-                activation=activation,
-                use_bias=use_bias,
-                kernel_initializer=kernel_initializer,
-                kernel_regularizer=tf.keras.regularizers.l2(C2),
+                activation='relu',
+                kernel_regularizer=tf.keras.regularizers.l2(reg_lambda),
                 name=f"hidden_layer_{layer_name_suffix}_{i}",
             ))
             self._layers.append(tf.keras.layers.Dropout(rate=droprate))
@@ -115,7 +109,7 @@ class Embed(tf.keras.layers.Layer):
     def __init__(
             self,
             embed_dim: int,
-            C2: float,
+            reg_lambda: float,
             layer_name_suffix: Text,
             similarity_type: Optional[Text] = None,
     ):
@@ -131,7 +125,7 @@ def __init__(
         self._layers = [tf.keras.layers.Dense(
             units=embed_dim,
             activation=None,
-            kernel_regularizer=tf.keras.regularizers.l2(C2),
+            kernel_regularizer=tf.keras.regularizers.l2(reg_lambda),
             name=f"embed_layer_{layer_name_suffix}",
         )]
 

From a803c31ca90d8d98eae2d2c588fb495a541816ca Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 21 Dec 2019 19:22:38 +0100
Subject: [PATCH 077/633] C2 to reg_lambda in model

---
 .../embedding_intent_classifier.py            | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index d8b1545d3c39..9e18952163dc 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1182,7 +1182,7 @@ def load(
 class DIET(tf.Module):
 
     @staticmethod
-    def _create_sparse_dense_layer(values, name, C2, dense_dim):
+    def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
 
         sparse = False
         for v in values:
@@ -1195,7 +1195,7 @@ def _create_sparse_dense_layer(values, name, C2, dense_dim):
 
         if sparse:
             return tf_layers.DenseForSparse(units=dense_dim,
-                                            C2=C2,
+                                            reg_lambda=reg_lambda,
                                             name=name)
 
     @staticmethod
@@ -1227,7 +1227,7 @@ def __init__(self,
                  pos_encoding,
                  max_seq_length,
                  unidirectional_encoder,
-                 C2,
+                 reg_lambda,
                  droprate,
                  sparse_input_dropout,
                  num_neg,
@@ -1272,11 +1272,11 @@ def __init__(self,
         self._sparse_to_dense = {
             "text": self._create_sparse_dense_layer(session_data["text_features"],
                                                     "text",
-                                                    C2,
+                                                    reg_lambda,
                                                     dense_dim),
             "intent": self._create_sparse_dense_layer(session_data["intent_features"],
                                                       "intent",
-                                                      C2,
+                                                      reg_lambda,
                                                       dense_dim),
         }
         self._layers.extend(self._get_layers(self._sparse_to_dense))
@@ -1284,11 +1284,11 @@ def __init__(self,
         self._ffnn = {
             "text": tf_layers.Ffnn(hidden_layer_sizes["text"],
                                    droprate,
-                                   C2,
+                                   reg_lambda,
                                    "text_intent" if share_hidden_layers else "text"),
             "intent": tf_layers.Ffnn(hidden_layer_sizes["intent"],
                                      droprate,
-                                     C2,
+                                     reg_lambda,
                                      "text_intent" if share_hidden_layers else "intent")
         }
         self._layers.extend(self._get_layers(self._ffnn))
@@ -1300,7 +1300,8 @@ def __init__(self,
                 num_heads,
                 transformer_size * 4,
                 max_seq_length,
-                droprate
+                droprate,
+                unidirectional_encoder,
             )
             self._layers.append(self._transformer)
         else:
@@ -1309,25 +1310,25 @@ def __init__(self,
         self._embed = {}
         if self._masked_lm_loss:
             self._embed["text_mask"] = tf_layers.Embed(embed_dim,
-                                                       C2,
+                                                       reg_lambda,
                                                        "text_mask",
                                                        similarity_type)
             self._embed["text_token"] = tf_layers.Embed(embed_dim,
-                                                        C2,
+                                                        reg_lambda,
                                                         "text_token",
                                                         similarity_type)
         if self._intent_classification:
             self._embed["text"] = tf_layers.Embed(embed_dim,
-                                                  C2,
+                                                  reg_lambda,
                                                   "text",
                                                   similarity_type)
             self._embed["intent"] = tf_layers.Embed(embed_dim,
-                                                    C2,
+                                                    reg_lambda,
                                                     "intent",
                                                     similarity_type)
         if self._named_entity_recognition:
             self._embed["logits"] = tf_layers.Embed(self._num_tags,
-                                                    C2,
+                                                    reg_lambda,
                                                     "logits")
         self._layers.extend(self._get_layers(self._embed))
 

From 25d612c68c2b66bcf6791e57e0812461227a7a03 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 12:39:04 +0100
Subject: [PATCH 078/633] add regularizers

---
 rasa/core/events/__init__.py                  |   5 +-
 .../embedding_intent_classifier.py            | 357 +++++++++---------
 rasa/utils/tf_layers.py                       | 222 ++++++-----
 rasa/utils/train_utils.py                     |  26 +-
 4 files changed, 334 insertions(+), 276 deletions(-)

diff --git a/rasa/core/events/__init__.py b/rasa/core/events/__init__.py
index a59651f10d9b..8a0559c0ecbb 100644
--- a/rasa/core/events/__init__.py
+++ b/rasa/core/events/__init__.py
@@ -143,10 +143,7 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List["Event
         return [cls(parameters.get("timestamp"), parameters.get("metadata"))]
 
     def as_dict(self):
-        d = {
-            "event": self.type_name,
-            "timestamp": self.timestamp,
-        }
+        d = {"event": self.type_name, "timestamp": self.timestamp}
 
         if self.metadata:
             d["metadata"] = self.metadata
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9e18952163dc..b536641a0716 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -950,7 +950,7 @@ def train(
         tf.random.set_seed(self.random_seed)
 
         # allows increasing batch size
-        batch_size_in = self.batch_in_size[0] #* tf.ones((), tf.int32)
+        batch_size_in = self.batch_in_size[0]  # * tf.ones((), tf.int32)
 
         train_dataset, eval_dataset = train_utils.create_datasets(
             session_data,
@@ -960,36 +960,40 @@ def train(
             label_key="intent_ids",
         )
 
-        self.model = DIET(session_data,
-                          self._label_data,
-                          self.dense_dim,
-                          self.embed_dim,
-                          self.hidden_layer_sizes,
-                          self.share_hidden_layers,
-                          self.num_transformer_layers,
-                          self.transformer_size,
-                          self.num_heads,
-                          self.pos_encoding,
-                          self.max_seq_length,
-                          self.unidirectional_encoder,
-                          self.C2,
-                          self.droprate,
-                          self.sparse_input_dropout,
-                          self.num_neg,
-                          self.loss_type,
-                          self.mu_pos,
-                          self.mu_neg,
-                          self.use_max_sim_neg,
-                          self.C_emb,
-                          self.scale_loss,
-                          self.similarity_type,
-                          self.masked_lm_loss,
-                          self.intent_classification,
-                          self.named_entity_recognition,
-                          self.inverted_tag_dict,
-                          self.learning_rate)
-
-        train_func = tf.function(self.model.train, input_signature=[train_dataset.element_spec])
+        self.model = DIET(
+            session_data,
+            self._label_data,
+            self.dense_dim,
+            self.embed_dim,
+            self.hidden_layer_sizes,
+            self.share_hidden_layers,
+            self.num_transformer_layers,
+            self.transformer_size,
+            self.num_heads,
+            self.pos_encoding,
+            self.max_seq_length,
+            self.unidirectional_encoder,
+            self.C2,
+            self.droprate,
+            self.sparse_input_dropout,
+            self.num_neg,
+            self.loss_type,
+            self.mu_pos,
+            self.mu_neg,
+            self.use_max_sim_neg,
+            self.C_emb,
+            self.scale_loss,
+            self.similarity_type,
+            self.masked_lm_loss,
+            self.intent_classification,
+            self.named_entity_recognition,
+            self.inverted_tag_dict,
+            self.learning_rate,
+        )
+
+        train_func = tf.function(
+            self.model.train, input_signature=[train_dataset.element_spec]
+        )
         # train_func = self.model.train
 
         train_utils.train_tf_dataset(
@@ -997,13 +1001,15 @@ def train(
             eval_dataset,
             batch_size_in,
             train_func,
-            [self.model.total_loss_metric,
-             self.model.mask_loss_metric,
-             self.model.intent_loss_metric,
-             self.model.entity_loss_metric,
-             self.model.mask_acc_metric,
-             self.model.intent_acc_metric,
-             self.model.entity_f1_metric],
+            [
+                self.model.total_loss_metric,
+                self.model.mask_loss_metric,
+                self.model.intent_loss_metric,
+                self.model.entity_loss_metric,
+                self.model.mask_acc_metric,
+                self.model.intent_acc_metric,
+                self.model.entity_f1_metric,
+            ],
             self.epochs,
             self.batch_in_size,
             self.evaluate_on_num_examples,
@@ -1014,9 +1020,7 @@ def train(
         # rebuild the graph for prediction
         self._build_tf_pred_graph(session_data)
 
-        self.attention_weights = train_utils.extract_attention(
-            self.attention_weights
-        )
+        self.attention_weights = train_utils.extract_attention(self.attention_weights)
 
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
@@ -1179,8 +1183,7 @@ def load(
             return cls(component_config=meta)
 
 
-class DIET(tf.Module):
-
+class DIET(tf.keras.layers.Layer):
     @staticmethod
     def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
 
@@ -1194,9 +1197,9 @@ def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
                 dense_dim = v[0].shape[-1]
 
         if sparse:
-            return tf_layers.DenseForSparse(units=dense_dim,
-                                            reg_lambda=reg_lambda,
-                                            name=name)
+            return tf_layers.DenseForSparse(
+                units=dense_dim, reg_lambda=reg_lambda, name=name
+            )
 
     @staticmethod
     def _input_dim(values, dense_dim):
@@ -1214,41 +1217,45 @@ def _input_dim(values, dense_dim):
     def _get_layers(layers: Dict):
         return [layer for layer in layers.values() if layer is not None]
 
-    def __init__(self,
-                 session_data,
-                 label_data,
-                 dense_dim,
-                 embed_dim,
-                 hidden_layer_sizes,
-                 share_hidden_layers,
-                 num_transformer_layers,
-                 transformer_size,
-                 num_heads,
-                 pos_encoding,
-                 max_seq_length,
-                 unidirectional_encoder,
-                 reg_lambda,
-                 droprate,
-                 sparse_input_dropout,
-                 num_neg,
-                 loss_type,
-                 mu_pos,
-                 mu_neg,
-                 use_max_sim_neg,
-                 C_emb,
-                 scale_loss,
-                 similarity_type,
-                 masked_lm_loss,
-                 intent_classification,
-                 named_entity_recognition,
-                 inverted_tag_dict,
-                 learning_rate):
+    def __init__(
+        self,
+        session_data,
+        label_data,
+        dense_dim,
+        embed_dim,
+        hidden_layer_sizes,
+        share_hidden_layers,
+        num_transformer_layers,
+        transformer_size,
+        num_heads,
+        pos_encoding,
+        max_seq_length,
+        unidirectional_encoder,
+        reg_lambda,
+        droprate,
+        sparse_input_dropout,
+        num_neg,
+        loss_type,
+        mu_pos,
+        mu_neg,
+        use_max_sim_neg,
+        C_emb,
+        scale_loss,
+        similarity_type,
+        masked_lm_loss,
+        intent_classification,
+        named_entity_recognition,
+        inverted_tag_dict,
+        learning_rate,
+    ):
         super(DIET, self).__init__(name="DIET")
 
         # data
         self.session_data = session_data
         label_batch = train_utils.prepare_batch(label_data)
-        self.tf_label_data, _ = train_utils.batch_to_session_data(label_batch, label_data)
+        self.tf_label_data, _ = train_utils.batch_to_session_data(
+            label_batch, label_data
+        )
 
         # options
         self._sparse_input_dropout = sparse_input_dropout
@@ -1266,32 +1273,30 @@ def __init__(self,
         self._num_tags = len(inverted_tag_dict)
 
         # tf objects
-        self._layers = []
-
         self._sparse_dropout = tf_layers.SparseDropout(rate=droprate)
         self._sparse_to_dense = {
-            "text": self._create_sparse_dense_layer(session_data["text_features"],
-                                                    "text",
-                                                    reg_lambda,
-                                                    dense_dim),
-            "intent": self._create_sparse_dense_layer(session_data["intent_features"],
-                                                      "intent",
-                                                      reg_lambda,
-                                                      dense_dim),
+            "text": self._create_sparse_dense_layer(
+                session_data["text_features"], "text", reg_lambda, dense_dim
+            ),
+            "intent": self._create_sparse_dense_layer(
+                session_data["intent_features"], "intent", reg_lambda, dense_dim
+            ),
         }
-        self._layers.extend(self._get_layers(self._sparse_to_dense))
 
         self._ffnn = {
-            "text": tf_layers.Ffnn(hidden_layer_sizes["text"],
-                                   droprate,
-                                   reg_lambda,
-                                   "text_intent" if share_hidden_layers else "text"),
-            "intent": tf_layers.Ffnn(hidden_layer_sizes["intent"],
-                                     droprate,
-                                     reg_lambda,
-                                     "text_intent" if share_hidden_layers else "intent")
+            "text": tf_layers.ReluFfn(
+                hidden_layer_sizes["text"],
+                droprate,
+                reg_lambda,
+                "text_intent" if share_hidden_layers else "text",
+            ),
+            "intent": tf_layers.ReluFfn(
+                hidden_layer_sizes["intent"],
+                droprate,
+                reg_lambda,
+                "text_intent" if share_hidden_layers else "intent",
+            ),
         }
-        self._layers.extend(self._get_layers(self._ffnn))
 
         if num_transformer_layers > 0:
             self._transformer = tf_layers.TransformerEncoder(
@@ -1300,71 +1305,73 @@ def __init__(self,
                 num_heads,
                 transformer_size * 4,
                 max_seq_length,
+                reg_lambda,
                 droprate,
                 unidirectional_encoder,
+                name="text_encoder",
             )
-            self._layers.append(self._transformer)
         else:
             self._transformer = lambda x, mask, training: x
 
         self._embed = {}
         if self._masked_lm_loss:
-            self._embed["text_mask"] = tf_layers.Embed(embed_dim,
-                                                       reg_lambda,
-                                                       "text_mask",
-                                                       similarity_type)
-            self._embed["text_token"] = tf_layers.Embed(embed_dim,
-                                                        reg_lambda,
-                                                        "text_token",
-                                                        similarity_type)
+            self._embed["text_mask"] = tf_layers.Embed(
+                embed_dim, reg_lambda, "text_mask", similarity_type
+            )
+            self._embed["text_token"] = tf_layers.Embed(
+                embed_dim, reg_lambda, "text_token", similarity_type
+            )
         if self._intent_classification:
-            self._embed["text"] = tf_layers.Embed(embed_dim,
-                                                  reg_lambda,
-                                                  "text",
-                                                  similarity_type)
-            self._embed["intent"] = tf_layers.Embed(embed_dim,
-                                                    reg_lambda,
-                                                    "intent",
-                                                    similarity_type)
+            self._embed["text"] = tf_layers.Embed(
+                embed_dim, reg_lambda, "text", similarity_type
+            )
+            self._embed["intent"] = tf_layers.Embed(
+                embed_dim, reg_lambda, "intent", similarity_type
+            )
         if self._named_entity_recognition:
-            self._embed["logits"] = tf_layers.Embed(self._num_tags,
-                                                    reg_lambda,
-                                                    "logits")
-        self._layers.extend(self._get_layers(self._embed))
+            self._embed["logits"] = tf_layers.Embed(
+                self._num_tags, reg_lambda, "logits"
+            )
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
         initializer = tf.keras.initializers.GlorotUniform()
         text_input_dim = self._input_dim(session_data["text_features"], dense_dim)
-        self._mask_vector = tf.Variable(
-            initial_value=initializer((1, 1, text_input_dim)),
+        self._mask_vector = self.add_weight(
+            shape=(1, 1, text_input_dim),
+            initializer=initializer,
             trainable=True,
-            name="mask_vector"
+            name="mask_vector",
         )
-        self._crf_params = tf.Variable(
-            initial_value=initializer((self._num_tags, self._num_tags)),
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._crf_params = self.add_weight(
+            shape=(self._num_tags, self._num_tags),
+            initializer=initializer,
+            regularizer=l2_regularizer,
             trainable=True,
-            name="crf_params"
+            name="crf_params",
         )
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
-        self.total_loss_metric = tf.keras.metrics.Mean(name='t_loss')
+        self.total_loss_metric = tf.keras.metrics.Mean(name="t_loss")
 
-        self.mask_loss_metric = tf.keras.metrics.Mean(name='m_loss')
-        self.intent_loss_metric = tf.keras.metrics.Mean(name='i_loss')
-        self.entity_loss_metric = tf.keras.metrics.Mean(name='e_loss')
+        self.mask_loss_metric = tf.keras.metrics.Mean(name="m_loss")
+        self.intent_loss_metric = tf.keras.metrics.Mean(name="i_loss")
+        self.entity_loss_metric = tf.keras.metrics.Mean(name="e_loss")
 
-        self.mask_acc_metric = tf.keras.metrics.Mean(name='m_acc')
-        self.intent_acc_metric = tf.keras.metrics.Mean(name='i_acc')
-        self.entity_f1_metric = tfa.metrics.F1Score(num_classes=self._num_tags, average='micro')
+        self.mask_acc_metric = tf.keras.metrics.Mean(name="m_acc")
+        self.intent_acc_metric = tf.keras.metrics.Mean(name="i_acc")
+        self.entity_f1_metric = tfa.metrics.F1Score(
+            num_classes=self._num_tags, average="micro", name="e_f1"
+        )
 
     def _combine_sparse_dense_features(
-            self,
-            features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-            mask: "tf.Tensor",
-            name: Text,
-            sparse_dropout: bool = False,
+        self,
+        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+        mask: "tf.Tensor",
+        name: Text,
+        sparse_dropout: bool = False,
     ) -> "tf.Tensor":
 
         dense_features = []
@@ -1376,34 +1383,36 @@ def _combine_sparse_dense_features(
                 else:
                     _f = f
 
-                dense_features.append(
-                    self._sparse_to_dense[name](_f)
-                )
+                dense_features.append(self._sparse_to_dense[name](_f))
             else:
                 dense_features.append(f)
 
         return tf.concat(dense_features, axis=-1) * mask
 
-    def _create_bow(self,
-                    features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-                    mask: "tf.Tensor",
-                    name: Text):
+    def _create_bow(
+        self,
+        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+        mask: "tf.Tensor",
+        name: Text,
+    ):
 
         x = self._combine_sparse_dense_features(features, mask, name)
         return self._ffnn[name](tf.reduce_sum(x, 1), self.training)
 
     def _mask_input(
-            self, a: "tf.Tensor", mask: "tf.Tensor"
+        self, a: "tf.Tensor", mask: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Randomly mask input sequences."""
 
         # do not substitute with cls token
-        pad_mask_up_to_last = tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        pad_mask_up_to_last = tf.math.cumprod(
+            1 - mask, axis=1, exclusive=True, reverse=True
+        )
         mask_up_to_last = 1 - pad_mask_up_to_last
 
         a_random_pad = (
-                tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
-                * pad_mask_up_to_last
+            tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
+            * pad_mask_up_to_last
         )
         # shuffle over batch dim
         a_shuffle = tf.random.shuffle(a * mask_up_to_last + a_random_pad)
@@ -1432,16 +1441,15 @@ def _mask_input(
 
         return a_pre, lm_mask_bool
 
-    def _create_sequence(self,
-                         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-                         mask: "tf.Tensor",
-                         name: Text,
-                         masked_lm_loss: bool):
+    def _create_sequence(
+        self,
+        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
+        mask: "tf.Tensor",
+        name: Text,
+        masked_lm_loss: bool,
+    ):
         x = self._combine_sparse_dense_features(
-            features,
-            mask,
-            name,
-            sparse_dropout=self._sparse_input_dropout,
+            features, mask, name, sparse_dropout=self._sparse_input_dropout
         )
 
         if masked_lm_loss:
@@ -1492,7 +1500,9 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
 
     def _build_all_b(self):
         all_labels = self._create_bow(
-            self.tf_label_data["intent_features"], self.tf_label_data["intent_mask"][0], "intent"
+            self.tf_label_data["intent_features"],
+            self.tf_label_data["intent_mask"][0],
+            "intent",
         )
         all_labels_embed = self._embed["intent"](all_labels)
 
@@ -1545,7 +1555,9 @@ def _entity_loss(
         loss = tf.reduce_mean(-log_likelihood)
 
         # CRF preds
-        pred_ids, _ = tfa.text.crf.crf_decode(logits, self._crf_params, sequence_lengths)
+        pred_ids, _ = tfa.text.crf.crf_decode(
+            logits, self._crf_params, sequence_lengths
+        )
 
         # calculate f1 score for train predictions
         mask_bool = tf.cast(mask[:, :, 0], tf.bool)
@@ -1559,45 +1571,54 @@ def _entity_loss(
 
         return loss
 
-    def _losses(self, batch_in):
-        tf_batch_data, _ = train_utils.batch_to_session_data(batch_in, self.session_data)
+    def _multi_task_losses(self, batch_in):
+        tf_batch_data, _ = train_utils.batch_to_session_data(
+            batch_in, self.session_data
+        )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
 
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss)
+            tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss
+        )
 
         losses = {}
 
         if self._masked_lm_loss:
-            losses["m_loss"] = self._mask_loss(text_transformed, text_in, lm_mask_bool_text, "text")
+            losses["m_loss"] = self._mask_loss(
+                text_transformed, text_in, lm_mask_bool_text, "text"
+            )
 
         if self._intent_classification:
             # get _cls_ vector for intent classification
-            last_index = tf.maximum(tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1)
+            last_index = tf.maximum(
+                tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
+            )
             idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
             cls = tf.gather_nd(text_transformed, idxs)
 
             label = self._create_bow(
-                tf_batch_data["intent_features"], tf_batch_data["intent_mask"][0], "intent"
+                tf_batch_data["intent_features"],
+                tf_batch_data["intent_mask"][0],
+                "intent",
             )
             losses["i_loss"] = self._intent_loss(cls, label)
 
         if self._named_entity_recognition:
             tags = tf_batch_data["tag_ids"][0]
 
-            losses["e_loss"] = self._entity_loss(text_transformed, tags, mask_text, sequence_lengths)
+            losses["e_loss"] = self._entity_loss(
+                text_transformed, tags, mask_text, sequence_lengths
+            )
 
         return losses
 
     def train(self, batch_in):
 
         with tf.GradientTape() as tape:
-            losses = self._losses(batch_in)
-            reg_losses = tf.math.add_n([tf.math.add_n(layer.losses) for layer in self._layers if layer.losses])
-
-            total_loss = tf.math.add_n(list(losses.values())) + reg_losses
+            losses = self._multi_task_losses(batch_in)
+            total_loss = tf.math.add_n(list(losses.values())) + self.losses
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index cc226d25fa2c..59698b8c6c4a 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -22,7 +22,6 @@
 
 
 class SparseDropout(tf.keras.layers.Dropout):
-
     def call(self, inputs, training):
         if training is None:
             training = tf.keras.backend.learning_phase()
@@ -43,20 +42,21 @@ class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor"""
 
     # noinspection PyPep8Naming
-    def __init__(self,
-                 reg_lambda: float,
-                 **kwargs):
-        kernel_regularizer = tf.keras.regularizers.l1(reg_lambda)
+    def __init__(self, reg_lambda: float, **kwargs):
+        l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
 
-        super(DenseForSparse, self).__init__(kernel_regularizer=kernel_regularizer,
-                                             **kwargs)
+        super(DenseForSparse, self).__init__(
+            kernel_regularizer=l1_regularizer, **kwargs
+        )
 
     def call(self, inputs):
         if not isinstance(inputs, tf.SparseTensor):
             raise ValueError("Input tensor should be sparse.")
 
         # outputs will be 2D
-        outputs = tf.sparse.sparse_dense_matmul(tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), self.kernel)
+        outputs = tf.sparse.sparse_dense_matmul(
+            tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), self.kernel
+        )
 
         if len(inputs.shape) == 3:
             # reshape back
@@ -71,10 +71,9 @@ def call(self, inputs):
         return outputs
 
 
-class Ffnn(tf.keras.layers.Layer):
-    """Create feed-forward nn with hidden layers and name suffix."""
+class ReluFfn(tf.keras.layers.Layer):
+    """Create feed-forward network with hidden layers and name suffix."""
 
-    # noinspection PyPep8Naming
     def __init__(
         self,
         layer_sizes: List[int],
@@ -82,20 +81,22 @@ def __init__(
         reg_lambda: float,
         layer_name_suffix: Text,
     ):
-        super(Ffnn, self).__init__(name=f"ffnn_{layer_name_suffix}")
+        super(ReluFfn, self).__init__(name=f"ffnn_{layer_name_suffix}")
 
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._layers = []
         for i, layer_size in enumerate(layer_sizes):
-            self._layers.append(tf.keras.layers.Dense(
-                units=layer_size,
-                activation='relu',
-                kernel_regularizer=tf.keras.regularizers.l2(reg_lambda),
-                name=f"hidden_layer_{layer_name_suffix}_{i}",
-            ))
+            self._layers.append(
+                tf.keras.layers.Dense(
+                    units=layer_size,
+                    activation="relu",
+                    kernel_regularizer=l2_regularizer,
+                    name=f"hidden_layer_{layer_name_suffix}_{i}",
+                )
+            )
             self._layers.append(tf.keras.layers.Dropout(rate=droprate))
 
-    def call(self, inputs, training):
-        x = inputs
+    def call(self, x, training):
         for layer in self._layers:
             x = layer(x, training=training)
 
@@ -105,13 +106,12 @@ def call(self, inputs, training):
 class Embed(tf.keras.layers.Layer):
     """Create dense embedding layer with a name."""
 
-    # noinspection PyPep8Naming
     def __init__(
-            self,
-            embed_dim: int,
-            reg_lambda: float,
-            layer_name_suffix: Text,
-            similarity_type: Optional[Text] = None,
+        self,
+        embed_dim: int,
+        reg_lambda: float,
+        layer_name_suffix: Text,
+        similarity_type: Optional[Text] = None,
     ):
         super(Embed, self).__init__(name=f"embed_{layer_name_suffix}")
 
@@ -122,17 +122,16 @@ def __init__(
                 f"should be 'cosine' or 'inner'"
             )
 
-        self._layers = [tf.keras.layers.Dense(
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._dense = tf.keras.layers.Dense(
             units=embed_dim,
             activation=None,
-            kernel_regularizer=tf.keras.regularizers.l2(reg_lambda),
+            kernel_regularizer=l2_regularizer,
             name=f"embed_layer_{layer_name_suffix}",
-        )]
+        )
 
-    def call(self, inputs):
-        x = inputs
-        for layer in self._layers:
-            x = layer(x)
+    def call(self, x):
+        x = self._dense(x)
         if self.similarity_type == "cosine":
             x = tf.nn.l2_normalize(x, -1)
 
@@ -141,9 +140,7 @@ def call(self, inputs):
 
 # from https://www.tensorflow.org/tutorials/text/transformer
 # and https://github.com/tensorflow/tensor2tensor
-# TODO collect losses
 class MultiHeadAttention(tf.keras.layers.Layer):
-
     @staticmethod
     def _scaled_dot_product_attention(q, k, v, pad_mask):
         """Calculate the attention weights.
@@ -171,39 +168,51 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
 
         # add the mask to the scaled tensor.
         if pad_mask is not None:
-            logits += (pad_mask * -1e9)
+            logits += pad_mask * -1e9
 
         # softmax is normalized on the last axis (seq_len_k) so that the scores
         # add up to 1.
-        attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)
+        attention_weights = tf.nn.softmax(
+            logits, axis=-1
+        )  # (..., seq_len_q, seq_len_k)
 
         output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
 
         return output, attention_weights
 
-    def __init__(self, d_model, num_heads):
+    def __init__(self, d_model, num_heads, reg_lambda):
         super(MultiHeadAttention, self).__init__()
         self.num_heads = num_heads
         self.d_model = d_model
 
         assert d_model % self.num_heads == 0
 
-        self.depth = d_model // self.num_heads
+        self._depth = d_model // self.num_heads
 
-        # TODO add weight regularization (L1)
-        self.wq = tf.keras.layers.Dense(d_model, use_bias=False)
-        self.wk = tf.keras.layers.Dense(d_model, use_bias=False)
-        self.wv = tf.keras.layers.Dense(d_model, use_bias=False)
+        l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
+        self._wq = tf.keras.layers.Dense(
+            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+        )
+        self._wk = tf.keras.layers.Dense(
+            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+        )
+        self._wv = tf.keras.layers.Dense(
+            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+        )
 
-        # TODO add weight regularization (L2)
-        self.dense = tf.keras.layers.Dense(d_model, use_bias=False)
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._dense = tf.keras.layers.Dense(
+            d_model, use_bias=False, kernel_regularizer=l2_regularizer
+        )
 
     def _split_heads(self, x):
         """Split the last dimension into (num_heads, depth).
-        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
+
+        Transpose the result such that the shape is
+        (batch_size, num_heads, seq_len, depth)
         """
 
-        x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self.depth))
+        x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth))
         return tf.transpose(x, perm=[0, 2, 1, 3])
 
     def _combine_heads(self, x):
@@ -216,68 +225,75 @@ def _combine_heads(self, x):
           a Tensor with shape [batch, length, channels]
         """
 
-        x = tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
-        return tf.reshape(x, (tf.shape(x)[0], -1, self.d_model))  # (batch_size, seq_len_q, d_model)
+        x = tf.transpose(
+            x, perm=[0, 2, 1, 3]
+        )  # (batch_size, seq_len_q, num_heads, depth)
+        return tf.reshape(
+            x, (tf.shape(x)[0], -1, self.d_model)
+        )  # (batch_size, seq_len_q, d_model)
 
     def call(self, v, k, q, pad_mask=None):
-        q = self.wq(q)  # (batch_size, seq_len, d_model)
-        k = self.wk(k)  # (batch_size, seq_len, d_model)
-        v = self.wv(v)  # (batch_size, seq_len, d_model)
+        q = self._wq(q)  # (batch_size, seq_len_q, d_model)
+        k = self._wk(k)  # (batch_size, seq_len_k, d_model)
+        v = self._wv(v)  # (batch_size, seq_len_v, d_model)
 
         q = self._split_heads(q)  # (batch_size, num_heads, seq_len_q, depth)
         k = self._split_heads(k)  # (batch_size, num_heads, seq_len_k, depth)
         v = self._split_heads(v)  # (batch_size, num_heads, seq_len_v, depth)
 
-        attention, attention_weights = self._scaled_dot_product_attention(q, k, v, pad_mask)
+        attention, attention_weights = self._scaled_dot_product_attention(
+            q, k, v, pad_mask
+        )
         # attention.shape == (batch_size, num_heads, seq_len_q, depth)
         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
         attention = self._combine_heads(attention)  # (batch_size, seq_len_q, d_model)
 
-        output = self.dense(attention)  # (batch_size, seq_len_q, d_model)
+        output = self._dense(attention)  # (batch_size, seq_len_q, d_model)
 
         return output, attention_weights
 
 
-# TODO collect losses
 class TransformerEncoderLayer(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads, dff, rate=0.1):
+    def __init__(self, d_model, num_heads, dff, reg_lambda, rate=0.1):
         super(TransformerEncoderLayer, self).__init__()
 
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self.mha = MultiHeadAttention(d_model, num_heads)
-        self.dropout = tf.keras.layers.Dropout(rate)
+        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self._mha = MultiHeadAttention(d_model, num_heads, reg_lambda)
+        self._dropout = tf.keras.layers.Dropout(rate)
 
-        # TODO add weight regularization (L2)
-        self.ffn_layers = [
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
-            tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dense(
+                dff, activation="relu", kernel_regularizer=l2_regularizer
+            ),  # (batch_size, seq_len, dff)
             tf.keras.layers.Dropout(rate),
-            tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
+            tf.keras.layers.Dense(
+                d_model, kernel_regularizer=l2_regularizer
+            ),  # (batch_size, seq_len, d_model)
             tf.keras.layers.Dropout(rate),
         ]
 
     def call(self, x, pad_mask, training):
 
-        x_norm = self.layernorm(x)  # (batch_size, seq_len, d_model)
-        attn, _ = self.mha(x_norm, x_norm, x_norm, pad_mask)
-        attn = self.dropout(attn, training=training)
-        x += attn
+        x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
+        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
+        attn_out = self._dropout(attn_out, training=training)
+        x += attn_out
 
-        ffn = x  # (batch_size, seq_len, d_model)
-        for layer in self.ffn_layers:
-            ffn = layer(ffn, training=training)
-        x += ffn
+        ffn_out = x  # (batch_size, seq_len, d_model)
+        for layer in self._ffn_layers:
+            ffn_out = layer(ffn_out, training=training)
+        x += ffn_out
 
         return x  # (batch_size, seq_len, d_model)
 
 
-# TODO collect losses
 class TransformerEncoder(tf.keras.layers.Layer):
-
     @staticmethod
     def _look_ahead_pad_mask(seq_len):
         pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-        return pad_mask[tf.newaxis, tf.newaxis, :, :]   # (1, 1, seq_len, seq_len)
+        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
 
     @staticmethod
     def _get_angles(pos, i, d_model):
@@ -286,9 +302,11 @@ def _get_angles(pos, i, d_model):
 
     @classmethod
     def _positional_encoding(cls, position, d_model):
-        angle_rads = cls._get_angles(np.arange(position)[:, np.newaxis],
-                                     np.arange(d_model)[np.newaxis, :],
-                                     d_model)
+        angle_rads = cls._get_angles(
+            np.arange(position)[:, np.newaxis],
+            np.arange(d_model)[np.newaxis, :],
+            d_model,
+        )
 
         # apply sin to even indices in the array; 2i
         angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
@@ -300,29 +318,45 @@ def _positional_encoding(cls, position, d_model):
 
         return tf.cast(pos_encoding, dtype=tf.float32)
 
-    def __init__(self, num_layers, d_model, num_heads, dff,
-                 max_seq_length, rate=0.1, unidirectional=False):
-        super(TransformerEncoder, self).__init__()
+    def __init__(
+        self,
+        num_layers,
+        d_model,
+        num_heads,
+        dff,
+        max_seq_length,
+        reg_lambda,
+        rate=0.1,
+        unidirectional=False,
+        name=None,
+    ):
+        super(TransformerEncoder, self).__init__(name=name)
 
         self.d_model = d_model
         self.unidirectional = unidirectional
 
-        # TODO use Embed
-        self.embedding = tf.keras.layers.Dense(units=d_model, use_bias=False)
-        self.pos_encoding = self._positional_encoding(max_seq_length,
-                                                      self.d_model)
-        self.dropout = tf.keras.layers.Dropout(rate)
-        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate)
-                           for _ in range(num_layers)]
-        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._embedding = tf.keras.layers.Dense(
+            units=d_model, kernel_regularizer=l2_regularizer
+        )
+
+        self._pos_encoding = self._positional_encoding(max_seq_length, self.d_model)
+
+        self._dropout = tf.keras.layers.Dropout(rate)
+
+        self._enc_layers = [
+            TransformerEncoderLayer(d_model, num_heads, dff, reg_lambda, rate)
+            for _ in range(num_layers)
+        ]
+        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
     def call(self, x, pad_mask, training):
 
         # adding embedding and position encoding.
-        x = self.embedding(x)  # (batch_size, seq_len, d_model)
+        x = self._embedding(x)  # (batch_size, seq_len, d_model)
         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self.pos_encoding[:, :tf.shape(x)[1], :] * (1 - pad_mask)
-        x = self.dropout(x, training=training)
+        x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
+        x = self._dropout(x, training=training)
 
         pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
         pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
@@ -332,10 +366,10 @@ def call(self, x, pad_mask, training):
                 1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
             )  # (batch_size, 1, seq_len, seq_len)
 
-        for layer in self.enc_layers:
-            x = layer(x, pad_mask, training)
+        for layer in self._enc_layers:
+            x = layer(x, pad_mask, training)  # (batch_size, seq_len, d_model)
 
         # if normalization is done in encoding layers, then it should also be done
         # on the output, since the output can grow very large, being the sum of
         # a whole stack of unnormalized layer outputs.
-        return self.layernorm(x)  # (batch_size, seq_len, d_model)
+        return self._layernorm(x)  # (batch_size, seq_len, d_model)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 4dce1786f7d9..2b24b3a272cd 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -18,6 +18,7 @@
 from tqdm import tqdm
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
+
 # from tensor2tensor.models.transformer import (
 #     transformer_base,
 #     transformer_prepare_encoder,
@@ -532,7 +533,9 @@ def create_datasets(
     )
 
     if eval_session_data is not None:
-        eval_dataset = create_tf_dataset(eval_session_data, batch_size, label_key=label_key)
+        eval_dataset = create_tf_dataset(
+            eval_session_data, batch_size, label_key=label_key
+        )
     else:
         eval_dataset = None
 
@@ -860,7 +863,9 @@ def _tf_loss_softmax(
     else:
         scale_mask = mask
 
-    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_ids, logits=logits)
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=label_ids, logits=logits
+    )
 
     # scale loss
     if len(loss.shape) == 2:
@@ -1103,14 +1108,15 @@ def train_tf_dataset(
         # for name, value in ep_train_metrics.score.items():
         #     train_metrics.score[name] = value / batches_per_epoch
 
-        postfix_dict = {"t_loss": mean_total_loss.numpy(),
-                        "m_loss": mean_mask_loss.numpy(),
-                        "i_loss": mean_intent_loss.numpy(),
-                        "e_loss": mean_entity_loss.numpy(),
-                        "m_acc": mean_mask_acc.numpy(),
-                        "i_acc": mean_intent_acc.numpy(),
-                        "e_f1": mean_entity_f1.numpy(),
-                        }
+        postfix_dict = {
+            "t_loss": mean_total_loss.numpy(),
+            "m_loss": mean_mask_loss.numpy(),
+            "i_loss": mean_intent_loss.numpy(),
+            "e_loss": mean_entity_loss.numpy(),
+            "m_acc": mean_mask_acc.numpy(),
+            "i_acc": mean_intent_acc.numpy(),
+            "e_f1": mean_entity_f1.numpy(),
+        }
         postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
 
         # if eval_init_op is not None:

From 6c3f52250f41f47e3bda5d61bf42d8a58c260e30 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 13:21:28 +0100
Subject: [PATCH 079/633] use dict for metrics

---
 .../embedding_intent_classifier.py            | 52 ++++++++-----------
 rasa/utils/tf_layers.py                       |  5 +-
 rasa/utils/train_utils.py                     | 35 ++-----------
 3 files changed, 28 insertions(+), 64 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b536641a0716..d77dcdd78e7b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1001,15 +1001,7 @@ def train(
             eval_dataset,
             batch_size_in,
             train_func,
-            [
-                self.model.total_loss_metric,
-                self.model.mask_loss_metric,
-                self.model.intent_loss_metric,
-                self.model.entity_loss_metric,
-                self.model.mask_acc_metric,
-                self.model.intent_acc_metric,
-                self.model.entity_f1_metric,
-            ],
+            self.model.out_metrics,
             self.epochs,
             self.batch_in_size,
             self.evaluate_on_num_examples,
@@ -1354,17 +1346,19 @@ def __init__(
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
-        self.total_loss_metric = tf.keras.metrics.Mean(name="t_loss")
-
-        self.mask_loss_metric = tf.keras.metrics.Mean(name="m_loss")
-        self.intent_loss_metric = tf.keras.metrics.Mean(name="i_loss")
-        self.entity_loss_metric = tf.keras.metrics.Mean(name="e_loss")
-
-        self.mask_acc_metric = tf.keras.metrics.Mean(name="m_acc")
-        self.intent_acc_metric = tf.keras.metrics.Mean(name="i_acc")
-        self.entity_f1_metric = tfa.metrics.F1Score(
-            num_classes=self._num_tags, average="micro", name="e_f1"
-        )
+        self.out_metrics = {
+            "t_loss": tf.keras.metrics.Mean(name="t_loss"),
+            "m_loss": tf.keras.metrics.Mean(name="m_loss"),
+            "i_loss": tf.keras.metrics.Mean(name="i_loss"),
+            "e_loss": tf.keras.metrics.Mean(name="e_loss"),
+            "m_acc": tf.keras.metrics.Mean(name="m_acc"),
+            "i_acc": tf.keras.metrics.Mean(name="i_acc"),
+            "e_f1": tfa.metrics.F1Score(
+                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+                average="micro",
+                name="e_f1",
+            ),
+        }
 
     def _combine_sparse_dense_features(
         self,
@@ -1494,7 +1488,8 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
             self._C_emb,
             self._scale_loss,
         )
-        self.mask_acc_metric.update_state(acc)
+        self.out_metrics["m_loss"].update_state(loss)
+        self.out_metrics["m_acc"].update_state(acc)
 
         return loss
 
@@ -1529,8 +1524,8 @@ def _intent_loss(self, a, b):
             self._C_emb,
             self._scale_loss,
         )
-
-        self.intent_acc_metric.update_state(acc)
+        self.out_metrics["i_loss"].update_state(loss)
+        self.out_metrics["i_acc"].update_state(acc)
 
         return loss
 
@@ -1567,7 +1562,8 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        self.entity_f1_metric.update_state(c_masked_1, pred_ids_masked_1)
+        self.out_metrics["e_loss"].update_state(loss)
+        self.out_metrics["e_f1"].update_state(c_masked_1, pred_ids_masked_1)
 
         return loss
 
@@ -1623,10 +1619,4 @@ def train(self, batch_in):
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-        self.total_loss_metric.update_state(total_loss)
-        if self._masked_lm_loss:
-            self.mask_loss_metric.update_state(losses["m_loss"])
-        if self._intent_classification:
-            self.intent_loss_metric.update_state(losses["i_loss"])
-        if self._named_entity_recognition:
-            self.entity_loss_metric.update_state(losses["e_loss"])
+        self.out_metrics["t_loss"].update_state(total_loss)
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 59698b8c6c4a..468ca30f3378 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -140,6 +140,7 @@ def call(self, x):
 
 # from https://www.tensorflow.org/tutorials/text/transformer
 # and https://github.com/tensorflow/tensor2tensor
+# TODO implement relative attention
 class MultiHeadAttention(tf.keras.layers.Layer):
     @staticmethod
     def _scaled_dot_product_attention(q, k, v, pad_mask):
@@ -201,9 +202,7 @@ def __init__(self, d_model, num_heads, reg_lambda):
         )
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._dense = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l2_regularizer
-        )
+        self._dense = tf.keras.layers.Dense(d_model, kernel_regularizer=l2_regularizer)
 
     def _split_heads(self, x):
         """Split the last dimension into (num_heads, depth).
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 2b24b3a272cd..e633bb231dcf 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1053,7 +1053,7 @@ def train_tf_dataset(
     eval_dataset: "tf.data.Dataset",
     batch_size_in: "tf.Tensor",
     train: Callable,
-    loss_metrics,
+    metrics,
     epochs: int,
     batch_size: Union[List[int], int],
     evaluate_on_num_examples: int,
@@ -1082,41 +1082,16 @@ def train_tf_dataset(
         )
 
         # Reset the metrics
-        loss_metrics[0].reset_states()
-        loss_metrics[1].reset_states()
-        loss_metrics[2].reset_states()
-        loss_metrics[3].reset_states()
-        loss_metrics[4].reset_states()
-        loss_metrics[5].reset_states()
-        loss_metrics[6].reset_states()
+        for metric in metrics.values():
+            metric.reset_states()
 
         # Train on batches
         for batch_in in train_dataset:
             train(batch_in)
 
         # Get the metric results
-        mean_total_loss = loss_metrics[0].result()
-        mean_mask_loss = loss_metrics[1].result()
-        mean_intent_loss = loss_metrics[2].result()
-        mean_entity_loss = loss_metrics[3].result()
-        mean_mask_acc = loss_metrics[4].result()
-        mean_intent_acc = loss_metrics[5].result()
-        mean_entity_f1 = loss_metrics[6].result()
-
-        # for name, value in ep_train_metrics.loss.items():
-        #     train_metrics.loss[name] = value / batches_per_epoch
-        # for name, value in ep_train_metrics.score.items():
-        #     train_metrics.score[name] = value / batches_per_epoch
-
-        postfix_dict = {
-            "t_loss": mean_total_loss.numpy(),
-            "m_loss": mean_mask_loss.numpy(),
-            "i_loss": mean_intent_loss.numpy(),
-            "e_loss": mean_entity_loss.numpy(),
-            "m_acc": mean_mask_acc.numpy(),
-            "i_acc": mean_intent_acc.numpy(),
-            "e_f1": mean_entity_f1.numpy(),
-        }
+        postfix_dict = {k: v.result().numpy() for k, v in metrics.items()}
+
         postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
 
         # if eval_init_op is not None:

From dd5b4c5ff0998aa458e8a92676bafa51f6797de5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 16:04:55 +0100
Subject: [PATCH 080/633] add increasing batch size

---
 .../embedding_intent_classifier.py            |  48 +++---
 rasa/utils/train_utils.py                     | 153 ++----------------
 2 files changed, 41 insertions(+), 160 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index d77dcdd78e7b..3d02bc66912e 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -949,19 +949,9 @@ def train(
         # set random seed
         tf.random.set_seed(self.random_seed)
 
-        # allows increasing batch size
-        batch_size_in = self.batch_in_size[0]  # * tf.ones((), tf.int32)
-
-        train_dataset, eval_dataset = train_utils.create_datasets(
-            session_data,
-            eval_session_data,
-            batch_size_in,
-            self.batch_in_strategy,
-            label_key="intent_ids",
-        )
-
         self.model = DIET(
             session_data,
+            eval_session_data,
             self._label_data,
             self.dense_dim,
             self.embed_dim,
@@ -970,7 +960,6 @@ def train(
             self.num_transformer_layers,
             self.transformer_size,
             self.num_heads,
-            self.pos_encoding,
             self.max_seq_length,
             self.unidirectional_encoder,
             self.C2,
@@ -989,19 +978,11 @@ def train(
             self.named_entity_recognition,
             self.inverted_tag_dict,
             self.learning_rate,
+            self.batch_in_strategy,
         )
 
-        train_func = tf.function(
-            self.model.train, input_signature=[train_dataset.element_spec]
-        )
-        # train_func = self.model.train
-
         train_utils.train_tf_dataset(
-            train_dataset,
-            eval_dataset,
-            batch_size_in,
-            train_func,
-            self.model.out_metrics,
+            self.model,
             self.epochs,
             self.batch_in_size,
             self.evaluate_on_num_examples,
@@ -1212,6 +1193,7 @@ def _get_layers(layers: Dict):
     def __init__(
         self,
         session_data,
+        eval_session_data,
         label_data,
         dense_dim,
         embed_dim,
@@ -1220,7 +1202,6 @@ def __init__(
         num_transformer_layers,
         transformer_size,
         num_heads,
-        pos_encoding,
         max_seq_length,
         unidirectional_encoder,
         reg_lambda,
@@ -1239,11 +1220,13 @@ def __init__(
         named_entity_recognition,
         inverted_tag_dict,
         learning_rate,
+        batch_in_strategy,
     ):
         super(DIET, self).__init__(name="DIET")
 
         # data
         self.session_data = session_data
+        self.eval_session_data = eval_session_data
         label_batch = train_utils.prepare_batch(label_data)
         self.tf_label_data, _ = train_utils.batch_to_session_data(
             label_batch, label_data
@@ -1263,6 +1246,7 @@ def __init__(
         self._named_entity_recognition = named_entity_recognition
         self._inverted_tag_dict = inverted_tag_dict
         self._num_tags = len(inverted_tag_dict)
+        self._batch_in_strategy = batch_in_strategy
 
         # tf objects
         self._sparse_dropout = tf_layers.SparseDropout(rate=droprate)
@@ -1611,7 +1595,6 @@ def _multi_task_losses(self, batch_in):
         return losses
 
     def train(self, batch_in):
-
         with tf.GradientTape() as tape:
             losses = self._multi_task_losses(batch_in)
             total_loss = tf.math.add_n(list(losses.values())) + self.losses
@@ -1620,3 +1603,20 @@ def train(self, batch_in):
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
         self.out_metrics["t_loss"].update_state(total_loss)
+
+    def train_dataset(self, batch_size):
+        return train_utils.create_tf_dataset(
+            self.session_data,
+            batch_size,
+            label_key="intent_ids",
+            batch_strategy=self._batch_in_strategy,
+            shuffle=True,
+        )
+
+    def eval_dataset(self, batch_size):
+        if self.eval_session_data is not None:
+            return train_utils.create_tf_dataset(
+                self.eval_session_data,
+                batch_size,
+                label_key="intent_ids",
+            )
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index e633bb231dcf..fc0344035523 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -515,121 +515,6 @@ def append_type(v: np.ndarray):
     return tuple(shapes), tuple(types)
 
 
-def create_datasets(
-    session_data: SessionDataType,
-    eval_session_data: SessionDataType,
-    batch_size: Union["tf.Tensor", int],
-    batch_strategy: Text,
-    label_key: Text,
-) -> Tuple["tf.data.Dataset", "tf.data.Dataset"]:
-    """Create iterator and init datasets."""
-
-    train_dataset = create_tf_dataset(
-        session_data,
-        batch_size,
-        label_key=label_key,
-        batch_strategy=batch_strategy,
-        shuffle=True,
-    )
-
-    if eval_session_data is not None:
-        eval_dataset = create_tf_dataset(
-            eval_session_data, batch_size, label_key=label_key
-        )
-    else:
-        eval_dataset = None
-
-    return train_dataset, eval_dataset
-
-
-# def create_t2t_hparams(
-#     num_transformer_layers: int,
-#     transformer_size: int,
-#     num_heads: int,
-#     droprate: float,
-#     pos_encoding: Text,
-#     max_seq_length: int,
-#     unidirectional_encoder: bool = True,
-# ) -> "HParams":
-#     """Create parameters for t2t transformer."""
-#
-#     hparams = transformer_base()
-#
-#     hparams.num_hidden_layers = num_transformer_layers
-#     hparams.hidden_size = transformer_size
-#     # it seems to be factor of 4 for transformer architectures in t2t
-#     hparams.filter_size = hparams.hidden_size * 4
-#     hparams.num_heads = num_heads
-#     hparams.relu_dropout = droprate
-#     hparams.pos = pos_encoding
-#
-#     hparams.max_length = max_seq_length
-#
-#     hparams.unidirectional_encoder = unidirectional_encoder
-#
-#     hparams.self_attention_type = "dot_product_relative_v2"
-#     hparams.max_relative_position = 5
-#     hparams.add_relative_to_values = True
-#
-#     # When not in training mode, set all forms of dropout to zero.
-#     training = tf.keras.backend.learning_phase()
-#     for key, value in hparams.values().items():
-#         if key.endswith("dropout") or key == "label_smoothing":
-#             setattr(hparams, key, value * tf.cast(training, tf.float32))
-#
-#     return hparams
-#
-#
-# # noinspection PyUnresolvedReferences
-# def create_t2t_transformer_encoder(
-#     x_in: "tf.Tensor",
-#     pre_transformer: "tf.keras.layers.Layer",
-#     mask: "tf.Tensor",
-#     attention_weights: Dict[Text, "tf.Tensor"],
-#     hparams: "HParams",
-#     name: Text,
-# ) -> "tf.Tensor":
-#     """Create t2t transformer encoder."""
-#     with tf.variable_scope(f"transformer_{name}", reuse=tf.AUTO_REUSE):
-#         if len(mask.shape) == 2:
-#             _mask = tf.expand_dims(mask, -1)
-#         else:
-#             _mask = mask
-#
-#         x = pre_transformer(x_in)
-#
-#         if hparams.multiply_embedding_mode == "sqrt_depth":
-#             x *= hparams.hidden_size ** 0.5
-#
-#         (
-#             x,
-#             self_attention_bias,
-#             encoder_decoder_attention_bias,
-#         ) = transformer_prepare_encoder(x, None, hparams)
-#
-#         x *= _mask
-#
-#         x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)
-#
-#         attn_bias_for_padding = None
-#         # Otherwise the encoder will just use encoder_self_attention_bias.
-#         if hparams.unidirectional_encoder:
-#             attn_bias_for_padding = encoder_decoder_attention_bias
-#
-#         x = transformer_encoder(
-#             x,
-#             self_attention_bias,
-#             hparams,
-#             nonpadding=_mask,
-#             save_weights_to=attention_weights,
-#             attn_bias_for_padding=attn_bias_for_padding,
-#         )
-#
-#         x *= _mask
-#
-#         return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout)
-
-
 def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor":
     """Make tensor 2D."""
 
@@ -1049,11 +934,7 @@ def output_validation_stat(
 
 
 def train_tf_dataset(
-    train_dataset: "tf.data.Dataset",
-    eval_dataset: "tf.data.Dataset",
-    batch_size_in: "tf.Tensor",
-    train: Callable,
-    metrics,
+    model,
     epochs: int,
     batch_size: Union[List[int], int],
     evaluate_on_num_examples: int,
@@ -1069,32 +950,32 @@ def train_tf_dataset(
         )
     pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
 
-    train_metrics = TrainingMetrics(loss={}, score={})
-    val_metrics = TrainingMetrics(loss={}, score={})
+    train_dataset_func = tf.function(model.train_dataset)
+    eval_dataset_func = tf.function(model.eval_dataset)
 
-    for ep in pbar:
+    tf_batch_size = tf.ones((), tf.int32)
+    train_func = tf.function(
+        model.train, input_signature=[train_dataset_func(tf_batch_size).element_spec]
+    )
+    # train_func = self.model.train
 
-        # ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs)
-        # batchsize_in += ep_batch_size - batch_size_in
+    for ep in pbar:
 
-        ep_train_metrics = TrainingMetrics(
-            loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
-        )
+        # allows increasing batch size
+        ep_batch_size = tf_batch_size * linearly_increasing_batch_size(ep, batch_size, epochs)
 
         # Reset the metrics
-        for metric in metrics.values():
+        for metric in model.out_metrics.values():
             metric.reset_states()
 
         # Train on batches
-        for batch_in in train_dataset:
-            train(batch_in)
+        for batch_in in train_dataset_func(ep_batch_size):
+            train_func(batch_in)
 
         # Get the metric results
-        postfix_dict = {k: v.result().numpy() for k, v in metrics.items()}
-
-        postfix_dict = _update_postfix_dict(postfix_dict, train_metrics)
+        postfix_dict = {k: v.result().numpy() for k, v in model.out_metrics.items()}
 
-        # if eval_init_op is not None:
+        # if eval_dataset_func(ep_batch_size) is not None:
         #     if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
         #         val_metrics = output_validation_stat(
         #             eval_init_op,
@@ -1115,7 +996,7 @@ def train_tf_dataset(
 
 
 def _update_postfix_dict(
-    postfix_dict: Dict[Text, Text], metrics: TrainingMetrics, prefix: Text = ""
+    postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
 ) -> Dict[Text, Text]:
     for name, value in metrics.loss.items():
         postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"

From f5bd29e4b1ce82c58278c64990ac98304726360e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 16:44:25 +0100
Subject: [PATCH 081/633] add validation

---
 .../embedding_intent_classifier.py            | 93 +++++++++++--------
 rasa/utils/train_utils.py                     | 50 ++++++----
 2 files changed, 84 insertions(+), 59 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3d02bc66912e..bdc8323fe4d9 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1186,10 +1186,6 @@ def _input_dim(values, dense_dim):
 
         return dense_dim * len(values)
 
-    @staticmethod
-    def _get_layers(layers: Dict):
-        return [layer for layer in layers.values() if layer is not None]
-
     def __init__(
         self,
         session_data,
@@ -1290,6 +1286,8 @@ def __init__(
             self._transformer = lambda x, mask, training: x
 
         self._embed = {}
+        self.train_metrics = {"t_loss": tf.keras.metrics.Mean(name="t_loss")}
+        self.eval_metrics = {"val_t_loss": tf.keras.metrics.Mean(name="val_t_loss")}
         if self._masked_lm_loss:
             self._embed["text_mask"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "text_mask", similarity_type
@@ -1297,6 +1295,11 @@ def __init__(
             self._embed["text_token"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "text_token", similarity_type
             )
+            self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
+            self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
+            self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
+            self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
+
         if self._intent_classification:
             self._embed["text"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "text", similarity_type
@@ -1304,10 +1307,19 @@ def __init__(
             self._embed["intent"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "intent", similarity_type
             )
+            self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
+            self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
+            self.eval_metrics["val_i_loss"] = tf.keras.metrics.Mean(name="val_i_loss")
+            self.eval_metrics["val_i_acc"] = tf.keras.metrics.Mean(name="val_i_acc")
+
         if self._named_entity_recognition:
             self._embed["logits"] = tf_layers.Embed(
                 self._num_tags, reg_lambda, "logits"
             )
+            self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
+            self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
+            self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
+            self.eval_metrics["val_e_f1"] = tf.keras.metrics.Mean(name="val_e_f1")
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -1330,19 +1342,10 @@ def __init__(
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
-        self.out_metrics = {
-            "t_loss": tf.keras.metrics.Mean(name="t_loss"),
-            "m_loss": tf.keras.metrics.Mean(name="m_loss"),
-            "i_loss": tf.keras.metrics.Mean(name="i_loss"),
-            "e_loss": tf.keras.metrics.Mean(name="e_loss"),
-            "m_acc": tf.keras.metrics.Mean(name="m_acc"),
-            "i_acc": tf.keras.metrics.Mean(name="i_acc"),
-            "e_f1": tfa.metrics.F1Score(
-                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-                average="micro",
-                name="e_f1",
-            ),
-        }
+        self.entity_f1 = tfa.metrics.F1Score(
+            num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+            average="micro",
+        )
 
     def _combine_sparse_dense_features(
         self,
@@ -1457,7 +1460,7 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
 
         a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
 
-        loss, acc = train_utils.calculate_loss_acc(
+        return train_utils.calculate_loss_acc(
             a_t_masked_embed,
             a_embed_masked,
             a_masked,
@@ -1472,10 +1475,6 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
             self._C_emb,
             self._scale_loss,
         )
-        self.out_metrics["m_loss"].update_state(loss)
-        self.out_metrics["m_acc"].update_state(acc)
-
-        return loss
 
     def _build_all_b(self):
         all_labels = self._create_bow(
@@ -1493,7 +1492,7 @@ def _intent_loss(self, a, b):
         a_embed = self._embed["text"](a)
         b_embed = self._embed["intent"](b)
 
-        loss, acc = train_utils.calculate_loss_acc(
+        return train_utils.calculate_loss_acc(
             a_embed,
             b_embed,
             b,
@@ -1508,10 +1507,6 @@ def _intent_loss(self, a, b):
             self._C_emb,
             self._scale_loss,
         )
-        self.out_metrics["i_loss"].update_state(loss)
-        self.out_metrics["i_acc"].update_state(acc)
-
-        return loss
 
     def _entity_loss(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor", sequence_lengths
@@ -1546,12 +1541,11 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        self.out_metrics["e_loss"].update_state(loss)
-        self.out_metrics["e_f1"].update_state(c_masked_1, pred_ids_masked_1)
+        f1 = self.entity_f1(c_masked_1, pred_ids_masked_1)
 
-        return loss
+        return loss, f1
 
-    def _multi_task_losses(self, batch_in):
+    def _train_losses_scores(self, batch_in):
         tf_batch_data, _ = train_utils.batch_to_session_data(
             batch_in, self.session_data
         )
@@ -1564,11 +1558,14 @@ def _multi_task_losses(self, batch_in):
         )
 
         losses = {}
+        scores = {}
 
         if self._masked_lm_loss:
-            losses["m_loss"] = self._mask_loss(
+            loss, acc = self._mask_loss(
                 text_transformed, text_in, lm_mask_bool_text, "text"
             )
+            losses["m_loss"] = loss
+            scores["m_acc"] = acc
 
         if self._intent_classification:
             # get _cls_ vector for intent classification
@@ -1583,26 +1580,34 @@ def _multi_task_losses(self, batch_in):
                 tf_batch_data["intent_mask"][0],
                 "intent",
             )
-            losses["i_loss"] = self._intent_loss(cls, label)
+            loss, acc = self._intent_loss(cls, label)
+            losses["i_loss"] = loss
+            scores["i_acc"] = acc
 
         if self._named_entity_recognition:
             tags = tf_batch_data["tag_ids"][0]
 
-            losses["e_loss"] = self._entity_loss(
+            loss, f1 = self._entity_loss(
                 text_transformed, tags, mask_text, sequence_lengths
             )
+            losses["e_loss"] = loss
+            scores["e_f1"] = f1
 
-        return losses
+        return losses, scores
 
     def train(self, batch_in):
         with tf.GradientTape() as tape:
-            losses = self._multi_task_losses(batch_in)
+            losses, scores = self._train_losses_scores(batch_in)
             total_loss = tf.math.add_n(list(losses.values())) + self.losses
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-        self.out_metrics["t_loss"].update_state(total_loss)
+        self.train_metrics["t_loss"].update_state(total_loss)
+        for k, v in losses.items():
+            self.train_metrics[k].update_state(v)
+        for k, v in scores.items():
+            self.train_metrics[k].update_state(v)
 
     def train_dataset(self, batch_size):
         return train_utils.create_tf_dataset(
@@ -1613,10 +1618,18 @@ def train_dataset(self, batch_size):
             shuffle=True,
         )
 
+    def eval(self, batch_in):
+        losses, scores = self._train_losses_scores(batch_in)
+        total_loss = tf.math.add_n(list(losses.values())) + self.losses
+
+        self.eval_metrics["val_t_loss"].update_state(total_loss)
+        for k, v in losses.items():
+            self.eval_metrics[f"val_{k}"].update_state(v)
+        for k, v in scores.items():
+            self.eval_metrics[f"val_{k}"].update_state(v)
+
     def eval_dataset(self, batch_size):
         if self.eval_session_data is not None:
             return train_utils.create_tf_dataset(
-                self.eval_session_data,
-                batch_size,
-                label_key="intent_ids",
+                self.eval_session_data, batch_size, label_key="intent_ids"
             )
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index fc0344035523..a34f42069ac6 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -950,6 +950,7 @@ def train_tf_dataset(
         )
     pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
 
+    # allows increasing batch size
     train_dataset_func = tf.function(model.train_dataset)
     eval_dataset_func = tf.function(model.eval_dataset)
 
@@ -957,15 +958,20 @@ def train_tf_dataset(
     train_func = tf.function(
         model.train, input_signature=[train_dataset_func(tf_batch_size).element_spec]
     )
-    # train_func = self.model.train
+    if evaluate_on_num_examples > 0:
+        eval_func = tf.function(
+            model.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
+        )
+    else:
+        eval_func = None
 
     for ep in pbar:
-
-        # allows increasing batch size
-        ep_batch_size = tf_batch_size * linearly_increasing_batch_size(ep, batch_size, epochs)
+        ep_batch_size = tf_batch_size * linearly_increasing_batch_size(
+            ep, batch_size, epochs
+        )
 
         # Reset the metrics
-        for metric in model.out_metrics.values():
+        for metric in model.train_metrics.values():
             metric.reset_states()
 
         # Train on batches
@@ -973,20 +979,26 @@ def train_tf_dataset(
             train_func(batch_in)
 
         # Get the metric results
-        postfix_dict = {k: v.result().numpy() for k, v in model.out_metrics.items()}
-
-        # if eval_dataset_func(ep_batch_size) is not None:
-        #     if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs:
-        #         val_metrics = output_validation_stat(
-        #             eval_init_op,
-        #             metrics,
-        #             session,
-        #             is_training,
-        #             batch_size_in,
-        #             ep_batch_size,
-        #         )
-        #
-        #     postfix_dict = _update_postfix_dict(postfix_dict, val_metrics, "val_")
+        postfix_dict = {k: v.result().numpy() for k, v in model.train_metrics.items()}
+
+        if evaluate_on_num_examples > 0:
+            if (
+                ep == 0
+                or (ep + 1) % evaluate_every_num_epochs == 0
+                or (ep + 1) == epochs
+            ):
+                # Reset the metrics
+                for metric in model.eval_metrics.values():
+                    metric.reset_states()
+
+                # Eval on batches
+                for batch_in in eval_dataset_func(ep_batch_size):
+                    eval_func(batch_in)
+
+            # Get the metric results
+            postfix_dict.update(
+                {k: v.result().numpy() for k, v in model.eval_metrics.items()}
+            )
 
         pbar.set_postfix(postfix_dict)
 

From 44cfc6156b050bca86154dfa5f5a4dfebea482d3 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 16:54:54 +0100
Subject: [PATCH 082/633] switch train phases

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 6 ++++++
 rasa/utils/train_utils.py                           | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index bdc8323fe4d9..7794ad6ad8d4 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1347,6 +1347,12 @@ def __init__(
             average="micro",
         )
 
+    def set_trainig_phase(self, training: bool):
+        if training:
+            self.training = tf.ones((), tf.bool)
+        else:
+            self.training = tf.zeros((), tf.bool)
+
     def _combine_sparse_dense_features(
         self,
         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index a34f42069ac6..b4dd785c6bf3 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -975,6 +975,7 @@ def train_tf_dataset(
             metric.reset_states()
 
         # Train on batches
+        model.set_trainig_phase(True)
         for batch_in in train_dataset_func(ep_batch_size):
             train_func(batch_in)
 
@@ -992,6 +993,7 @@ def train_tf_dataset(
                     metric.reset_states()
 
                 # Eval on batches
+                model.set_trainig_phase(False)
                 for batch_in in eval_dataset_func(ep_batch_size):
                     eval_func(batch_in)
 

From 411d9185bca4abed26dc2d72de459c3f2300f3e9 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 16:55:38 +0100
Subject: [PATCH 083/633] switch train phases

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 rasa/utils/train_utils.py                           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7794ad6ad8d4..3b2ae97b66ee 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1347,7 +1347,7 @@ def __init__(
             average="micro",
         )
 
-    def set_trainig_phase(self, training: bool):
+    def set_training_phase(self, training: bool):
         if training:
             self.training = tf.ones((), tf.bool)
         else:
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index b4dd785c6bf3..73191fce7f78 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -975,7 +975,7 @@ def train_tf_dataset(
             metric.reset_states()
 
         # Train on batches
-        model.set_trainig_phase(True)
+        model.set_training_phase(True)
         for batch_in in train_dataset_func(ep_batch_size):
             train_func(batch_in)
 
@@ -993,7 +993,7 @@ def train_tf_dataset(
                     metric.reset_states()
 
                 # Eval on batches
-                model.set_trainig_phase(False)
+                model.set_training_phase(False)
                 for batch_in in eval_dataset_func(ep_batch_size):
                     eval_func(batch_in)
 

From b8fdcf042c692f74f99fb414698ecc6616d8a695 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 23 Dec 2019 17:42:27 +0100
Subject: [PATCH 084/633] add predict method

---
 .../embedding_intent_classifier.py            | 144 +++++++++---------
 rasa/utils/tf_layers.py                       |   1 +
 rasa/utils/train_utils.py                     |  30 +++-
 3 files changed, 92 insertions(+), 83 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3b2ae97b66ee..9b56b89ec93d 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -657,77 +657,6 @@ def _create_session_data(
 
         return session_data
 
-    def _build_tf_pred_graph(self, session_data: "SessionDataType"):
-
-        shapes, types = train_utils.get_shapes_types(session_data)
-
-        batch_placeholder = []
-        for s, t in zip(shapes, types):
-            batch_placeholder.append(tf.placeholder(t, s))
-
-        self.batch_in = tf.tuple(batch_placeholder)
-
-        batch_data, self.batch_tuple_sizes = train_utils.batch_to_session_data(
-            self.batch_in, session_data
-        )
-
-        mask = batch_data["text_mask"][0]
-        a = self.combine_sparse_dense_features(
-            batch_data["text_features"], mask, "text"
-        )
-
-        # transformer
-        a = self._create_tf_sequence(a, mask)
-
-        if self.intent_classification:
-            b = self.combine_sparse_dense_features(
-                batch_data["intent_features"], batch_data["intent_mask"][0], "intent"
-            )
-            self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed))
-
-            self._pred_intent_graph(a, b, mask)
-
-        if self.named_entity_recognition:
-            self._pred_entity_graph(a, mask)
-
-    def _pred_intent_graph(self, a: "tf.Tensor", b: "tf.Tensor", mask: "tf.Tensor"):
-        last = mask * tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-
-        # get _cls_ embedding
-        self.cls_embed = tf.reduce_sum(a * last, 1)
-        self.cls_embed = train_utils.create_tf_embed(
-            self.cls_embed, self.embed_dim, self.C2, "cls", self.similarity_type
-        )
-
-        b = tf.reduce_sum(b, 1)
-
-        self.sim_all = train_utils.tf_raw_sim(
-            self.cls_embed[:, tf.newaxis, :],
-            self.all_labels_embed[tf.newaxis, :, :],
-            None,
-        )
-        self.label_embed = self._create_tf_embed_fnn(
-            b,
-            self.hidden_layer_sizes["intent"],
-            fnn_name="text_intent" if self.share_hidden_layers else "intent",
-            embed_name="intent",
-        )
-        self.sim = train_utils.tf_raw_sim(
-            self.cls_embed[:, tf.newaxis, :], self.label_embed, None
-        )
-
-        self.intent_prediction = train_utils.confidence_from_sim(
-            self.sim_all, self.similarity_type
-        )
-
-    def _pred_entity_graph(self, a: "tf.Tensor", mask: "tf.Tensor"):
-        mask_up_to_last = 1 - tf.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        sequence_lengths = tf.cast(tf.reduce_sum(mask_up_to_last[:, :, 0], 1), tf.int32)
-
-        # predict tagsx
-        _, _, pred_ids = self._create_crf(a, sequence_lengths)
-        self.entity_prediction = tf.to_int64(pred_ids)
-
     # train helpers
     def preprocess_train_data(self, training_data: "TrainingData"):
         """Prepares data for training.
@@ -991,9 +920,9 @@ def train(
         )
 
         # rebuild the graph for prediction
-        self._build_tf_pred_graph(session_data)
+        self.model.build_for_predict()
 
-        self.attention_weights = train_utils.extract_attention(self.attention_weights)
+        # self.attention_weights = train_utils.extract_attention(self.attention_weights)
 
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
@@ -1224,7 +1153,7 @@ def __init__(
         self.session_data = session_data
         self.eval_session_data = eval_session_data
         label_batch = train_utils.prepare_batch(label_data)
-        self.tf_label_data, _ = train_utils.batch_to_session_data(
+        self.tf_label_data = train_utils.batch_to_session_data(
             label_batch, label_data
         )
 
@@ -1237,6 +1166,7 @@ def __init__(
         self._use_max_sim_neg = use_max_sim_neg
         self._C_emb = C_emb
         self._scale_loss = scale_loss
+        self._similarity_type = similarity_type
         self._masked_lm_loss = masked_lm_loss
         self._intent_classification = intent_classification
         self._named_entity_recognition = named_entity_recognition
@@ -1347,6 +1277,10 @@ def __init__(
             average="micro",
         )
 
+        # persist
+        self.all_labels_embed = None
+        self.batch_tuple_sizes = None
+
     def set_training_phase(self, training: bool):
         if training:
             self.training = tf.ones((), tf.bool)
@@ -1433,7 +1367,7 @@ def _create_sequence(
         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
         mask: "tf.Tensor",
         name: Text,
-        masked_lm_loss: bool,
+        masked_lm_loss: bool = False,
     ):
         x = self._combine_sparse_dense_features(
             features, mask, name, sparse_dropout=self._sparse_input_dropout
@@ -1552,7 +1486,7 @@ def _entity_loss(
         return loss, f1
 
     def _train_losses_scores(self, batch_in):
-        tf_batch_data, _ = train_utils.batch_to_session_data(
+        tf_batch_data = train_utils.batch_to_session_data(
             batch_in, self.session_data
         )
 
@@ -1639,3 +1573,61 @@ def eval_dataset(self, batch_size):
             return train_utils.create_tf_dataset(
                 self.eval_session_data, batch_size, label_key="intent_ids"
             )
+
+    def build_for_predict(self):
+        self.batch_tuple_sizes = train_utils.batch_tuple_sizes(self.session_data)
+
+        all_labels_embed, _ = self._build_all_b()
+        self.all_labels_embed = tf.constant(all_labels_embed.numpy())
+
+    def predict(self, batch_in):
+        tf_batch_data, _ = train_utils.batch_to_session_data(
+            batch_in, self.session_data
+        )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
+
+        text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, "text"
+        )
+
+        out = {}
+        if self._intent_classification:
+            # get _cls_ vector for intent classification
+            last_index = tf.maximum(
+                tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
+            )
+            idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
+            cls = tf.gather_nd(text_transformed, idxs)
+            cls_embed = self._embed["text"](cls)
+
+            sim_all = train_utils.tf_raw_sim(
+                cls_embed[:, tf.newaxis, :],
+                self.all_labels_embed[tf.newaxis, :, :],
+                None,
+            )
+            label = self._create_bow(
+                tf_batch_data["intent_features"],
+                tf_batch_data["intent_mask"][0],
+                "intent",
+            )
+            label_embed = self._embed["intent"](label)
+            sim = train_utils.tf_raw_sim(
+                cls_embed[:, tf.newaxis, :], label_embed, None
+            )
+
+            scores = train_utils.confidence_from_sim(
+                sim_all, self._similarity_type
+            )
+            out["i_scores"] = scores
+
+        if self.named_entity_recognition:
+            sequence_lengths = sequence_lengths - 1
+            logits = self._embed["logits"](text_transformed)
+            pred_ids, _ = tfa.text.crf.crf_decode(
+                logits, self._crf_params, sequence_lengths
+            )
+            out["e_ids"] = pred_ids
+
+        return out
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 468ca30f3378..18916409aeff 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -141,6 +141,7 @@ def call(self, x):
 # from https://www.tensorflow.org/tutorials/text/transformer
 # and https://github.com/tensorflow/tensor2tensor
 # TODO implement relative attention
+# TODO save attention weights
 class MultiHeadAttention(tf.keras.layers.Layer):
     @staticmethod
     def _scaled_dot_product_attention(q, k, v, pad_mask):
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 73191fce7f78..f6e1e38e3153 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -422,7 +422,7 @@ def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
 
 def batch_to_session_data(
     batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionDataType
-) -> Tuple[Dict[Text, List[tf.Tensor]], Dict[Text, int]]:
+) -> Dict[Text, List[tf.Tensor]]:
     """Convert input batch tensors into batch data format.
 
     Batch contains any number of batch data. The order is equal to the
@@ -432,12 +432,9 @@ def batch_to_session_data(
     """
 
     batch_data = defaultdict(list)
-    # save the amount of placeholders attributed to session data keys
-    tuple_sizes = defaultdict(int)
 
     idx = 0
     for k, values in session_data.items():
-        tuple_sizes[k] = 0
         for v in values:
             if isinstance(v[0], scipy.sparse.spmatrix):
                 # explicitly substitute last dimension in shape with known static value
@@ -448,14 +445,33 @@ def batch_to_session_data(
                         [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]],
                     )
                 )
-                tuple_sizes[k] += 3
                 idx += 3
             else:
                 batch_data[k].append(batch[idx])
+                idx += 1
+
+    return batch_data
+
+
+def batch_tuple_sizes(
+    session_data: SessionDataType
+) -> Dict[Text, int]:
+
+    # save the amount of placeholders attributed to session data keys
+    tuple_sizes = defaultdict(int)
+
+    idx = 0
+    for k, values in session_data.items():
+        tuple_sizes[k] = 0
+        for v in values:
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                tuple_sizes[k] += 3
+                idx += 3
+            else:
                 tuple_sizes[k] += 1
                 idx += 1
 
-    return batch_data, tuple_sizes
+    return tuple_sizes
 
 
 def create_tf_dataset(
@@ -943,7 +959,7 @@ def train_tf_dataset(
 ) -> None:
     """Train tf graph"""
 
-    if evaluate_on_num_examples:
+    if evaluate_on_num_examples > 0:
         logger.info(
             f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
             f"epochs."

From 578cfdb7eb2935a0fd74aa3d821200501a523485 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 13 Jan 2020 15:33:53 +0100
Subject: [PATCH 085/633] create crf and input mask layers

---
 .../embedding_intent_classifier.py            | 83 +++--------------
 rasa/utils/tf_layers.py                       | 89 ++++++++++++++++++-
 2 files changed, 97 insertions(+), 75 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9b56b89ec93d..9ea146ffcb13 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1219,6 +1219,7 @@ def __init__(
         self.train_metrics = {"t_loss": tf.keras.metrics.Mean(name="t_loss")}
         self.eval_metrics = {"val_t_loss": tf.keras.metrics.Mean(name="val_t_loss")}
         if self._masked_lm_loss:
+            self._input_mask = tf_layers.InputMask()
             self._embed["text_mask"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "text_mask", similarity_type
             )
@@ -1229,6 +1230,8 @@ def __init__(
             self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
             self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
             self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
+        else:
+            self._input_mask = None
 
         if self._intent_classification:
             self._embed["text"] = tf_layers.Embed(
@@ -1246,29 +1249,16 @@ def __init__(
             self._embed["logits"] = tf_layers.Embed(
                 self._num_tags, reg_lambda, "logits"
             )
+            self._crf = tf_layers.CRF(self._num_tags)
             self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
             self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
             self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
             self.eval_metrics["val_e_f1"] = tf.keras.metrics.Mean(name="val_e_f1")
+        else:
+            self._crf = None
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
-        initializer = tf.keras.initializers.GlorotUniform()
-        text_input_dim = self._input_dim(session_data["text_features"], dense_dim)
-        self._mask_vector = self.add_weight(
-            shape=(1, 1, text_input_dim),
-            initializer=initializer,
-            trainable=True,
-            name="mask_vector",
-        )
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._crf_params = self.add_weight(
-            shape=(self._num_tags, self._num_tags),
-            initializer=initializer,
-            regularizer=l2_regularizer,
-            trainable=True,
-            name="crf_params",
-        )
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(learning_rate)
@@ -1320,48 +1310,6 @@ def _create_bow(
         x = self._combine_sparse_dense_features(features, mask, name)
         return self._ffnn[name](tf.reduce_sum(x, 1), self.training)
 
-    def _mask_input(
-        self, a: "tf.Tensor", mask: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        """Randomly mask input sequences."""
-
-        # do not substitute with cls token
-        pad_mask_up_to_last = tf.math.cumprod(
-            1 - mask, axis=1, exclusive=True, reverse=True
-        )
-        mask_up_to_last = 1 - pad_mask_up_to_last
-
-        a_random_pad = (
-            tf.random.uniform(tf.shape(a), tf.reduce_min(a), tf.reduce_max(a), a.dtype)
-            * pad_mask_up_to_last
-        )
-        # shuffle over batch dim
-        a_shuffle = tf.random.shuffle(a * mask_up_to_last + a_random_pad)
-
-        # shuffle over sequence dim
-        a_shuffle = tf.transpose(a_shuffle, [1, 0, 2])
-        a_shuffle = tf.random.shuffle(a_shuffle)
-        a_shuffle = tf.transpose(a_shuffle, [1, 0, 2])
-
-        # shuffle doesn't support backprop
-        a_shuffle = tf.stop_gradient(a_shuffle)
-
-        a_mask = tf.tile(self._mask_vector, (tf.shape(a)[0], tf.shape(a)[1], 1))
-
-        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
-        other_prob = tf.tile(other_prob, (1, 1, a.shape[-1]))
-        a_other = tf.where(
-            other_prob < 0.70, a_mask, tf.where(other_prob < 0.80, a_shuffle, a)
-        )
-
-        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
-        a_pre = tf.where(tf.tile(lm_mask_bool, (1, 1, a.shape[-1])), a_other, a)
-
-        a_pre = tf.cond(self.training, lambda: a_pre, lambda: a)
-
-        return a_pre, lm_mask_bool
-
     def _create_sequence(
         self,
         features: List[Union["tf.Tensor", "tf.SparseTensor"]],
@@ -1374,7 +1322,7 @@ def _create_sequence(
         )
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._mask_input(x, mask)
+            pre, lm_mask_bool = self._input_mask(x, mask, self.training)
         else:
             pre, lm_mask_bool = (x, None)
 
@@ -1463,15 +1411,10 @@ def _entity_loss(
         # c: (batch-size, max-seq-len)
 
         # CRF Loss
-        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
-            logits, c, sequence_lengths, self._crf_params
-        )
-        loss = tf.reduce_mean(-log_likelihood)
+        loss = self._crf.loss(logits, c, sequence_lengths)
 
         # CRF preds
-        pred_ids, _ = tfa.text.crf.crf_decode(
-            logits, self._crf_params, sequence_lengths
-        )
+        pred_ids = self._crf(logits, sequence_lengths)
 
         # calculate f1 score for train predictions
         mask_bool = tf.cast(mask[:, :, 0], tf.bool)
@@ -1581,14 +1524,14 @@ def build_for_predict(self):
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(self, batch_in):
-        tf_batch_data, _ = train_utils.batch_to_session_data(
+        tf_batch_data = train_utils.batch_to_session_data(
             batch_in, self.session_data
         )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
 
-        text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
+        text_transformed, _, _ = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text"
         )
 
@@ -1625,9 +1568,7 @@ def predict(self, batch_in):
         if self.named_entity_recognition:
             sequence_lengths = sequence_lengths - 1
             logits = self._embed["logits"](text_transformed)
-            pred_ids, _ = tfa.text.crf.crf_decode(
-                logits, self._crf_params, sequence_lengths
-            )
+            pred_ids = self._crf(logits, sequence_lengths)
             out["e_ids"] = pred_ids
 
         return out
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 18916409aeff..9f76acf74830 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -13,7 +13,9 @@
     NamedTuple,
 )
 import tensorflow as tf
+import tensorflow_addons as tfa
 import numpy as np
+from rasa.utils import train_utils
 
 if typing.TYPE_CHECKING:
     from tensor2tensor.utils.hparam import HParams
@@ -84,9 +86,9 @@ def __init__(
         super(ReluFfn, self).__init__(name=f"ffnn_{layer_name_suffix}")
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._layers = []
+        self._ffn_layers = []
         for i, layer_size in enumerate(layer_sizes):
-            self._layers.append(
+            self._ffn_layers.append(
                 tf.keras.layers.Dense(
                     units=layer_size,
                     activation="relu",
@@ -94,10 +96,10 @@ def __init__(
                     name=f"hidden_layer_{layer_name_suffix}_{i}",
                 )
             )
-            self._layers.append(tf.keras.layers.Dropout(rate=droprate))
+            self._ffn_layers.append(tf.keras.layers.Dropout(rate=droprate))
 
     def call(self, x, training):
-        for layer in self._layers:
+        for layer in self._ffn_layers:
             x = layer(x, training=training)
 
         return x
@@ -373,3 +375,82 @@ def call(self, x, pad_mask, training):
         # on the output, since the output can grow very large, being the sum of
         # a whole stack of unnormalized layer outputs.
         return self._layernorm(x)  # (batch_size, seq_len, d_model)
+
+
+class InputMask(tf.keras.layers.Layer):
+
+    def build(self, input_shape):
+        initializer = tf.keras.initializers.GlorotUniform()
+        self.mask_vector = self.add_weight(
+            shape=(1, 1, input_shape[-1]),
+            initializer=initializer,
+            trainable=True,
+            name="mask_vector",
+        )
+        self.built = True
+
+    def call(self, x, mask, training):
+        """Randomly mask input sequences."""
+
+        # do not substitute with cls token
+        pad_mask_up_to_last = tf.math.cumprod(
+            1 - mask, axis=1, exclusive=True, reverse=True
+        )
+        mask_up_to_last = 1 - pad_mask_up_to_last
+
+        x_random_pad = (
+            tf.random.uniform(tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype)
+            * pad_mask_up_to_last
+        )
+        # shuffle over batch dim
+        x_shuffle = tf.random.shuffle(x * mask_up_to_last + x_random_pad)
+
+        # shuffle over sequence dim
+        x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
+        x_shuffle = tf.random.shuffle(x_shuffle)
+        x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
+
+        # shuffle doesn't support backprop
+        x_shuffle = tf.stop_gradient(x_shuffle)
+
+        mask_vector = tf.tile(self.mask_vector, (tf.shape(x)[0], tf.shape(x)[1], 1))
+
+        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
+        other_prob = tf.tile(other_prob, (1, 1, x.shape[-1]))
+        x_other = tf.where(
+            other_prob < 0.70, mask_vector, tf.where(other_prob < 0.80, x_shuffle, x)
+        )
+
+        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
+        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
+        x_masked = tf.where(tf.tile(lm_mask_bool, (1, 1, x.shape[-1])), x_other, x)
+
+        x_masked = tf.cond(training, lambda: x_masked, lambda: x)
+
+        return x_masked, lm_mask_bool
+
+
+class CRF(tf.keras.layers.Layer):
+
+    def __init__(self, num_tags, name=None):
+        super().__init__(name=name)
+
+        initializer = tf.keras.initializers.GlorotUniform()
+        self.transition_params = self.add_weight(
+            shape=(num_tags, num_tags),
+            initializer=initializer,
+            trainable=True,
+            name="transitions",
+        )
+
+    def call(self, logits, sequence_lengths):
+        pred_ids, _ = tfa.text.crf.crf_decode(
+            logits, self.transition_params, sequence_lengths
+        )
+        return pred_ids
+
+    def loss(self, logits, tag_indices, sequence_lengths):
+        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
+            logits, tag_indices, sequence_lengths, self.transition_params
+        )
+        return tf.reduce_mean(-log_likelihood)

From 356810a2a9474030ef62b583356071ebb17d2215 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 13 Jan 2020 15:54:13 +0100
Subject: [PATCH 086/633] add entity featurizer

---
 .../sparse_featurizer/entity_featurizer.py    | 281 ++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py

diff --git a/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py
new file mode 100644
index 000000000000..decad2738013
--- /dev/null
+++ b/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py
@@ -0,0 +1,281 @@
+import logging
+from collections import defaultdict
+
+import numpy as np
+import os
+import pickle
+import typing
+import scipy.sparse
+from typing import Any, Dict, Optional, Text, List
+
+from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TEXT_ATTRIBUTE,
+    SPARSE_FEATURE_NAMES,
+    SPACY_DOCS,
+)
+
+logger = logging.getLogger(__name__)
+
+if typing.TYPE_CHECKING:
+    from rasa.nlu.model import Metadata
+
+try:
+    import spacy
+except ImportError:
+    spacy = None
+
+
+class CRFToken(typing.NamedTuple):
+    text: Text
+    pos_tag: Text
+    pattern: Dict[Text, Any]
+
+
+class EntityFeaturizer(Featurizer):
+
+    provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
+
+    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+
+    defaults = {
+        # crf_features is [before, word, after] array with before, word,
+        # after holding keys about which
+        # features to use for each word, for example, 'title' in
+        # array before will have the feature
+        # "is the preceding word in title case?"
+        # POS features require spaCy to be installed
+        "features": [
+            ["low", "title", "upper"],
+            [
+                "bias",
+                "low",
+                "prefix5",
+                "prefix2",
+                "suffix5",
+                "suffix3",
+                "suffix2",
+                "upper",
+                "title",
+                "digit",
+                "pattern",
+            ],
+            ["low", "title", "upper"],
+        ]
+    }
+
+    function_dict = {
+        "low": lambda crf_token: crf_token.text.lower(),  # pytype: disable=attribute-error
+        "title": lambda crf_token: crf_token.text.istitle(),  # pytype: disable=attribute-error
+        "prefix5": lambda crf_token: crf_token.text[:5],
+        "prefix2": lambda crf_token: crf_token.text[:2],
+        "suffix5": lambda crf_token: crf_token.text[-5:],
+        "suffix3": lambda crf_token: crf_token.text[-3:],
+        "suffix2": lambda crf_token: crf_token.text[-2:],
+        "suffix1": lambda crf_token: crf_token.text[-1:],
+        "pos": lambda crf_token: crf_token.pos_tag,
+        "pos2": lambda crf_token: crf_token.pos_tag[:2],
+        "bias": lambda crf_token: "bias",
+        "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
+        "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
+        "pattern": lambda crf_token: crf_token.pattern,
+    }
+
+    def __init__(
+        self,
+        component_config: Dict[Text, Any],
+        feature_id_dict: Optional[Dict[Text, Dict[Text, int]]] = None,
+    ):
+        super().__init__(component_config)
+
+        self.feature_id_dict = feature_id_dict
+        self._check_pos_features_and_spacy()
+
+    def train(
+        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+    ) -> None:
+        self.feature_id_dict = self._create_feature_id_dict(training_data)
+
+        for example in training_data.training_examples:
+            self._text_features_for_entities(example)
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+        self._text_features_for_entities(message)
+
+    def _text_features_for_entities(self, message: Message) -> None:
+        tokens = self._from_text_to_crf(message)
+        features = self._sentence_to_features(tokens)
+
+        num_features = sum(
+            [
+                len(feature_vals.values())
+                for feature_vals in self.feature_id_dict.values()
+            ]
+        )
+
+        vec = np.zeros([len(tokens), num_features])
+
+        # convert features into one-hot
+        for token_idx, token in enumerate(features):
+            for k, v in token.items():
+                if k in self.feature_id_dict and str(v) in self.feature_id_dict[k]:
+                    feature_idx = self.feature_id_dict[k][str(v)]
+                    vec[token_idx][feature_idx] = 1
+
+        entity_features = scipy.sparse.coo_matrix(vec)
+
+        # set features
+        features = self._combine_with_existing_sparse_features(
+            message, entity_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        )
+        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], features)
+
+    def _create_feature_id_dict(
+        self, training_data: TrainingData
+    ) -> Dict[Text, Dict[Text, int]]:
+        features = []
+        for example in training_data.training_examples:
+            tokens = self._from_text_to_crf(example)
+            features.append(self._sentence_to_features(tokens))
+
+        # build vocab of features
+        vocab_x = defaultdict(set)
+        for sent_features in features:
+            for token_features in sent_features:
+                for key, val in token_features.items():
+                    vocab_x[key].add(val)
+
+        feature_id_dict = {}
+        offset = 0
+        for key, val in vocab_x.items():
+            feature_id_dict[key] = {
+                str(feature_val): idx
+                for idx, feature_val in enumerate(sorted(val), offset)
+            }
+            offset += len(val)
+
+        return feature_id_dict
+
+    def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any]]:
+        """Convert a word into discrete features in self.crf_features,
+        including word before and word after."""
+
+        configured_features = self.component_config["features"]
+        sentence_features = []
+
+        for word_idx in range(len(sentence)):
+            # word before(-1), current word(0), next word(+1)
+            feature_span = len(configured_features)
+            half_span = feature_span // 2
+            feature_range = range(-half_span, half_span + 1)
+            prefixes = [str(i) for i in feature_range]
+            word_features = {}
+            for f_i in feature_range:
+                if word_idx + f_i >= len(sentence):
+                    word_features["EOS"] = True
+                    # End Of Sentence
+                elif word_idx + f_i < 0:
+                    word_features["BOS"] = True
+                    # Beginning Of Sentence
+                else:
+                    word = sentence[word_idx + f_i]
+                    f_i_from_zero = f_i + half_span
+                    prefix = prefixes[f_i_from_zero]
+                    features = configured_features[f_i_from_zero]
+                    for feature in features:
+                        if feature == "pattern":
+                            # add all regexes as a feature
+                            regex_patterns = self.function_dict[feature](word)
+                            # pytype: disable=attribute-error
+                            for p_name, matched in regex_patterns.items():
+                                feature_name = prefix + ":" + feature + ":" + p_name
+                                word_features[feature_name] = matched
+                            # pytype: enable=attribute-error
+                        else:
+                            # append each feature to a feature vector
+                            value = self.function_dict[feature](word)
+                            word_features[prefix + ":" + feature] = value
+            sentence_features.append(word_features)
+        return sentence_features
+
+    def _from_text_to_crf(self, message: Message) -> List[CRFToken]:
+        """Takes a sentence and switches it to crfsuite format."""
+
+        crf_format = []
+        if self.pos_features:
+            tokens = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+        else:
+            tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
+
+        for i, token in enumerate(tokens):
+            pattern = self.__pattern_of_token(message, i)
+            pos_tag = self.__tag_of_token(token) if self.pos_features else None
+
+            crf_format.append(CRFToken(token.text, pos_tag, pattern))
+
+        return crf_format
+
+    @staticmethod
+    def __pattern_of_token(message, i):
+        if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None:
+            return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get("pattern", {})
+        else:
+            return {}
+
+    @staticmethod
+    def __tag_of_token(token):
+        if spacy.about.__version__ > "2" and token._.has("tag"):
+            return token._.get("tag")
+        else:
+            return token.tag_
+
+    def _check_pos_features_and_spacy(self):
+        import itertools
+
+        features = self.component_config.get("features", [])
+        fts = set(itertools.chain.from_iterable(features))
+        self.pos_features = "pos" in fts or "pos2" in fts
+        if self.pos_features:
+            self._check_spacy()
+
+    @staticmethod
+    def _check_spacy():
+        if spacy is None:
+            raise ImportError(
+                "Failed to import `spaCy`. "
+                "`spaCy` is required for POS features "
+                "See https://spacy.io/usage/ for installation"
+                "instructions."
+            )
+
+    @classmethod
+    def load(
+        cls,
+        meta: Dict[Text, Any],
+        model_dir: Optional[Text] = None,
+        model_metadata: Optional["Metadata"] = None,
+        cached_component: Optional["EntityFeaturizer"] = None,
+        **kwargs: Any,
+    ) -> "EntityFeaturizer":
+
+        file_name = meta.get("file")
+
+        with open(
+            os.path.join(model_dir, file_name + ".feature_id_dict.pkl"), "rb"
+        ) as f:
+            feature_id_dict = pickle.load(f)
+
+        return EntityFeaturizer(meta, feature_id_dict=feature_id_dict)
+
+    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
+        """Persist this model into the passed directory.
+        Return the metadata necessary to load the model again."""
+        with open(
+            os.path.join(model_dir, file_name + ".feature_id_dict.pkl"), "wb"
+        ) as f:
+            pickle.dump(self.feature_id_dict, f)
+
+        return {"file": file_name}

From 9fdd9797fa3e19bff16566f33119ba9f97ac98fd Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 14 Jan 2020 15:58:03 +0100
Subject: [PATCH 087/633] save load predict

---
 .../embedding_intent_classifier.py            | 482 +++++++++---------
 rasa/utils/tf_layers.py                       |  14 +-
 2 files changed, 256 insertions(+), 240 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9ea146ffcb13..7692c98fcffc 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -189,8 +189,8 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
                 "hidden_layer_sizes for a and b must coincide"
             )
 
-        self.batch_in_size = config["batch_size"]
-        self.batch_in_strategy = config["batch_strategy"]
+        self.batch_size = config["batch_size"]
+        self.batch_strategy = config["batch_strategy"]
 
         self.optimizer = config["optimizer"]
         self.normalize_loss = config["normalize_loss"]
@@ -262,16 +262,8 @@ def __init__(
         component_config: Optional[Dict[Text, Any]] = None,
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        session: Optional["tf.Session"] = None,
-        graph: Optional["tf.Graph"] = None,
-        batch_placeholder: Optional["tf.Tensor"] = None,
-        similarity_all: Optional["tf.Tensor"] = None,
-        intent_prediction: Optional["tf.Tensor"] = None,
-        entity_prediction: Optional["tf.Tensor"] = None,
-        similarity: Optional["tf.Tensor"] = None,
-        cls_embed: Optional["tf.Tensor"] = None,
-        label_embed: Optional["tf.Tensor"] = None,
-        all_labels_embed: Optional["tf.Tensor"] = None,
+        model=None,
+        predict_func=None,
         batch_tuple_sizes: Optional[Dict] = None,
         attention_weights: Optional["tf.Tensor"] = None,
     ) -> None:
@@ -284,22 +276,12 @@ def __init__(
         # transform numbers to labels
         self.inverted_label_dict = inverted_label_dict
         self.inverted_tag_dict = inverted_tag_dict
-        # encode all label_ids with numbers
-        self._label_data = None
 
-        # tf related instances
-        self.session = session
-        self.graph = graph
-        self.batch_in = batch_placeholder
-        self.sim_all = similarity_all
-        self.intent_prediction = intent_prediction
-        self.entity_prediction = entity_prediction
-        self.sim = similarity
+        self.model = model
+        self.predict_func = predict_func
 
-        # persisted embeddings
-        self.cls_embed = cls_embed
-        self.label_embed = label_embed
-        self.all_labels_embed = all_labels_embed
+        # encode all label_ids with numbers
+        self._label_data = None
 
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes
@@ -460,11 +442,17 @@ def _add_to_session_data(
             if data.size > 0:
                 session_data[key].append(data)
 
+        if not session_data[key]:
+            del session_data[key]
+
     @staticmethod
     def _add_mask_to_session_data(
         session_data: SessionDataType, key: Text, from_key: Text
     ):
 
+        if not session_data.get(from_key):
+            return
+
         session_data[key] = []
 
         for data in session_data[from_key]:
@@ -708,137 +696,6 @@ def apply_bilou_schema(self, training_data: "TrainingData"):
 
             example.set(MESSAGE_BILOU_ENTITIES_ATTRIBUTE, output)
 
-    # process helpers
-    def predict_label(
-        self, message: "Message"
-    ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
-
-        label = {"name": None, "confidence": 0.0}
-        label_ranking = []
-
-        if self.session is None:
-            logger.error(
-                "There is no trained tf.session: "
-                "component is either not trained or "
-                "didn't receive enough training data"
-            )
-            return label, label_ranking
-
-        # create session data from message and convert it into a batch of 1
-        session_data = self._create_session_data([message])
-        batch = train_utils.prepare_batch(
-            session_data, tuple_sizes=self.batch_tuple_sizes
-        )
-
-        # load tf graph and session
-        label_ids, message_sim = self._calculate_message_sim(batch)
-
-        # if X contains all zeros do not predict some label
-        if label_ids.size > 0:
-            label = {
-                "name": self.inverted_label_dict[label_ids[0]],
-                "confidence": message_sim[0],
-            }
-
-            ranking = list(zip(list(label_ids), message_sim))
-            ranking = ranking[:LABEL_RANKING_LENGTH]
-            label_ranking = [
-                {"name": self.inverted_label_dict[label_idx], "confidence": score}
-                for label_idx, score in ranking
-            ]
-
-        return label, label_ranking
-
-    def _calculate_message_sim(
-        self, batch: Tuple[np.ndarray]
-    ) -> Tuple[np.ndarray, List[float]]:
-        """Calculate message similarities"""
-
-        message_sim = self.session.run(
-            self.intent_prediction,
-            feed_dict={
-                _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None
-            },
-        )
-
-        message_sim = message_sim.flatten()  # sim is a matrix
-
-        label_ids = message_sim.argsort()[::-1]
-        message_sim[::-1].sort()
-
-        # transform sim to python list for JSON serializing
-        return label_ids, message_sim.tolist()
-
-    def predict_entities(self, message: "Message") -> List[Dict]:
-        if self.session is None:
-            logger.error(
-                "There is no trained tf.session: "
-                "component is either not trained or "
-                "didn't receive enough training data"
-            )
-            return []
-
-        # create session data from message and convert it into a batch of 1
-        self.num_tags = len(self.inverted_tag_dict)
-        session_data = self._create_session_data([message])
-        batch = train_utils.prepare_batch(
-            session_data, tuple_sizes=self.batch_tuple_sizes
-        )
-
-        # load tf graph and session
-        predictions = self.session.run(
-            self.entity_prediction,
-            feed_dict={
-                _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None
-            },
-        )
-
-        tags = [self.inverted_tag_dict[p] for p in predictions[0]]
-
-        if self.bilou_flag:
-            tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
-
-        entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags
-        )
-
-        extracted = self.add_extractor_name(entities)
-        entities = message.get("entities", []) + extracted
-
-        return entities
-
-    def _convert_tags_to_entities(
-        self, text: str, tokens: List[Token], tags: List[Text]
-    ) -> List[Dict[Text, Any]]:
-        entities = []
-        last_tag = "O"
-        for token, tag in zip(tokens, tags):
-            if tag == "O":
-                last_tag = tag
-                continue
-
-            # new tag found
-            if last_tag != tag:
-                entity = {
-                    "entity": tag,
-                    "start": token.offset,
-                    "end": token.end,
-                    "extractor": "flair",
-                }
-                entities.append(entity)
-
-            # belongs to last entity
-            elif last_tag == tag:
-                entities[-1]["end"] = token.end
-
-            last_tag = tag
-
-        for entity in entities:
-            entity["value"] = text[entity["start"] : entity["end"]]
-
-        return entities
-
-    # methods to overwrite
     def train(
         self,
         training_data: "TrainingData",
@@ -907,34 +764,146 @@ def train(
             self.named_entity_recognition,
             self.inverted_tag_dict,
             self.learning_rate,
-            self.batch_in_strategy,
+            self.batch_strategy,
         )
 
         train_utils.train_tf_dataset(
             self.model,
             self.epochs,
-            self.batch_in_size,
+            self.batch_size,
             self.evaluate_on_num_examples,
             self.evaluate_every_num_epochs,
             output_file=self.training_log_file,
         )
 
         # rebuild the graph for prediction
-        self.model.build_for_predict()
+        # self.model.build_for_predict()
 
         # self.attention_weights = train_utils.extract_attention(self.attention_weights)
 
+        # process helpers
+
+    def _predict(self, message: "Message"):
+        if self.model is None or self.predict_func is None:
+            return
+
+        # create session data from message and convert it into a batch of 1
+        session_data = self._create_session_data([message])
+        self.model.session_data = session_data
+        predict_dataset = self.model.predict_dataset()
+        batch_in = next(iter(predict_dataset))
+
+        return self.predict_func(batch_in)
+
+    def _predict_label(
+            self, out
+    ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
+
+        label = {"name": None, "confidence": 0.0}
+        label_ranking = []
+
+        if self.model is None:
+            logger.error(
+                "There is no trained tf.session: "
+                "component is either not trained or "
+                "didn't receive enough training data"
+            )
+            return label, label_ranking
+
+        message_sim = out["i_scores"].numpy()
+
+        message_sim = message_sim.flatten()  # sim is a matrix
+
+        label_ids = message_sim.argsort()[::-1]
+        message_sim[::-1].sort()
+        message_sim = message_sim.tolist()
+
+        # if X contains all zeros do not predict some label
+        if label_ids.size > 0:
+            label = {
+                "name": self.inverted_label_dict[label_ids[0]],
+                "confidence": message_sim[0],
+            }
+
+            ranking = list(zip(list(label_ids), message_sim))
+            ranking = ranking[:LABEL_RANKING_LENGTH]
+            label_ranking = [
+                {"name": self.inverted_label_dict[label_idx], "confidence": score}
+                for label_idx, score in ranking
+            ]
+
+        return label, label_ranking
+
+    def _predict_entities(self, out, message: "Message") -> List[Dict]:
+        if self.model is None:
+            logger.error(
+                "There is no trained tf.session: "
+                "component is either not trained or "
+                "didn't receive enough training data"
+            )
+            return []
+
+        # load tf graph and session
+        predictions = out["e_ids"].numpy()
+
+        tags = [self.inverted_tag_dict[p] for p in predictions[0]]
+
+        if self.bilou_flag:
+            tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
+
+        entities = self._convert_tags_to_entities(
+            message.text, message.get("tokens", []), tags
+        )
+
+        extracted = self.add_extractor_name(entities)
+        entities = message.get("entities", []) + extracted
+
+        return entities
+
+    def _convert_tags_to_entities(
+            self, text: str, tokens: List[Token], tags: List[Text]
+    ) -> List[Dict[Text, Any]]:
+        entities = []
+        last_tag = "O"
+        for token, tag in zip(tokens, tags):
+            if tag == "O":
+                last_tag = tag
+                continue
+
+            # new tag found
+            if last_tag != tag:
+                entity = {
+                    "entity": tag,
+                    "start": token.offset,
+                    "end": token.end,
+                    "extractor": "flair",
+                }
+                entities.append(entity)
+
+            # belongs to last entity
+            elif last_tag == tag:
+                entities[-1]["end"] = token.end
+
+            last_tag = tag
+
+        for entity in entities:
+            entity["value"] = text[entity["start"]: entity["end"]]
+
+        return entities
+
     def process(self, message: "Message", **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
 
+        out = self._predict(message)
+
         if self.intent_classification:
-            label, label_ranking = self.predict_label(message)
+            label, label_ranking = self._predict_label(out)
 
             message.set("intent", label, add_to_output=True)
             message.set("intent_ranking", label_ranking, add_to_output=True)
 
         if self.named_entity_recognition:
-            entities = self.predict_entities(message)
+            entities = self._predict_entities(out, message)
 
             message.set("entities", entities, add_to_output=True)
 
@@ -944,49 +913,42 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         Return the metadata necessary to load the model again.
         """
 
-        if self.session is None:
+        if self.model is None:
             return {"file": None}
 
-        checkpoint = os.path.join(model_dir, file_name + ".ckpt")
+        model_file = os.path.join(model_dir, file_name +".tf_model")
 
-        # plot training curves
-        plotter = Plotter()
-        plotter.plot_training_curves(self.training_log_file, model_dir)
-        # copy trainig log file
-        copyfile(self.training_log_file, os.path.join(model_dir, "training-log.tsv"))
+        # # plot training curves
+        # plotter = Plotter()
+        # plotter.plot_training_curves(self.training_log_file, model_dir)
+        # # copy trainig log file
+        # copyfile(self.training_log_file, os.path.join(model_dir, "training-log.tsv"))
 
         try:
-            os.makedirs(os.path.dirname(checkpoint))
+            os.makedirs(os.path.dirname(model_file))
         except OSError as e:
             # be happy if someone already created the path
             import errno
 
             if e.errno != errno.EEXIST:
                 raise
-        with self.graph.as_default():
-            train_utils.persist_tensor("batch_placeholder", self.batch_in, self.graph)
 
-            train_utils.persist_tensor("similarity_all", self.sim_all, self.graph)
-            train_utils.persist_tensor(
-                "intent_prediction", self.intent_prediction, self.graph
-            )
-            train_utils.persist_tensor(
-                "entity_prediction", self.entity_prediction, self.graph
-            )
-            train_utils.persist_tensor("similarity", self.sim, self.graph)
+        self.model.save_weights(model_file, save_format='tf')
 
-            train_utils.persist_tensor("cls_embed", self.cls_embed, self.graph)
-            train_utils.persist_tensor("label_embed", self.label_embed, self.graph)
-            train_utils.persist_tensor(
-                "all_labels_embed", self.all_labels_embed, self.graph
-            )
+        dummy_session_data = {
+            k: [v[:1] for v in vs]
+            for k, vs in self.model.session_data.items()
+        }
 
-            train_utils.persist_tensor(
-                "attention_weights", self.attention_weights, self.graph
-            )
+        with open(
+            os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "wb"
+        ) as f:
+            pickle.dump(dummy_session_data, f)
 
-            saver = tf.train.Saver()
-            saver.save(self.session, checkpoint)
+        with open(
+            os.path.join(model_dir, file_name + ".label_data.pkl"), "wb"
+        ) as f:
+            pickle.dump(self._label_data, f)
 
         with open(
             os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "wb"
@@ -1018,31 +980,20 @@ def load(
 
         if model_dir and meta.get("file"):
             file_name = meta.get("file")
-            checkpoint = os.path.join(model_dir, file_name + ".ckpt")
+            model_file = os.path.join(model_dir, file_name + ".tf_model")
 
             with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
                 _tf_config = pickle.load(f)
 
-            graph = tf.Graph()
-            with graph.as_default():
-                session = tf.compat.v1.Session(config=_tf_config)
-                saver = tf.compat.v1.train.import_meta_graph(checkpoint + ".meta")
-
-                saver.restore(session, checkpoint)
-
-                batch_in = train_utils.load_tensor("batch_placeholder")
-
-                sim_all = train_utils.load_tensor("similarity_all")
-                cls_embed = train_utils.load_tensor("cls_embed")
-                intent_prediction = train_utils.load_tensor("intent_prediction")
-                entity_prediction = train_utils.load_tensor("entity_prediction")
-                sim = train_utils.load_tensor("similarity")
-
-                message_embed = train_utils.load_tensor("message_embed")
-                label_embed = train_utils.load_tensor("label_embed")
-                all_labels_embed = train_utils.load_tensor("all_labels_embed")
+            with open(
+                os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "rb"
+            ) as f:
+                dummy_session_data = pickle.load(f)
 
-                attention_weights = train_utils.load_tensor("attention_weights")
+            with open(
+                os.path.join(model_dir, file_name + ".label_data.pkl"), "rb"
+            ) as f:
+                label_data = pickle.load(f)
 
             with open(
                 os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb"
@@ -1059,21 +1010,79 @@ def load(
             ) as f:
                 batch_tuple_sizes = pickle.load(f)
 
+            hidden_layer_sizes = {
+                "text": meta["hidden_layers_sizes_a"],
+                "intent": meta["hidden_layers_sizes_b"],
+                "tag": meta["hidden_layers_sizes_c"],
+            }
+            similarity_type = meta["similarity_type"]
+            if similarity_type == "auto":
+                if meta["loss_type"] == "softmax":
+                    similarity_type = "inner"
+                elif meta["loss_type"] == "margin":
+                    similarity_type = "cosine"
+
+            model = DIET(
+                dummy_session_data,
+                None,
+                label_data,
+                meta["dense_dim"],
+                meta["embed_dim"],
+                hidden_layer_sizes,
+                meta["share_hidden_layers"],
+                meta["num_transformer_layers"],
+                meta["transformer_size"],
+                meta["num_heads"],
+                meta["max_seq_length"],
+                meta["unidirectional_encoder"],
+                meta["C2"],
+                meta["droprate"],
+                meta["sparse_input_dropout"],
+                meta["num_neg"],
+                meta["loss_type"],
+                meta["mu_pos"],
+                meta["mu_neg"],
+                meta["use_max_sim_neg"],
+                meta["C_emb"],
+                meta["scale_loss"],
+                similarity_type,
+                meta["masked_lm_loss"],
+                meta["intent_classification"],
+                meta["named_entity_recognition"],
+                inv_tag_dict,
+                meta["learning_rate"],
+                meta["batch_strategy"],
+            )
+
+            train_utils.train_tf_dataset(
+                model,
+                1,
+                1,
+                0,
+                0,
+            )
+
+            model.load_weights(model_file)
+
+            # build the graph for prediction
+            model.session_data = {
+                k: vs
+                for k, vs in model.session_data.items()
+                if "text" in k
+            }
+            model.build_for_predict()
+            predict_dataset = model.predict_dataset()
+            predict_func = tf.function(
+                model.predict, input_signature=[predict_dataset.element_spec]
+            )
+            batch_in = next(iter(predict_dataset))
+            predict_func(batch_in)
             return cls(
                 component_config=meta,
                 inverted_label_dict=inv_label_dict,
                 inverted_tag_dict=inv_tag_dict,
-                session=session,
-                graph=graph,
-                batch_placeholder=batch_in,
-                similarity_all=sim_all,
-                intent_prediction=intent_prediction,
-                entity_prediction=entity_prediction,
-                similarity=sim,
-                cls_embed=cls_embed,
-                label_embed=label_embed,
-                all_labels_embed=all_labels_embed,
-                attention_weights=attention_weights,
+                model=model,
+                predict_func=predict_func,
                 batch_tuple_sizes=batch_tuple_sizes,
             )
 
@@ -1085,7 +1094,7 @@ def load(
             return cls(component_config=meta)
 
 
-class DIET(tf.keras.layers.Layer):
+class DIET(tf.keras.models.Model):
     @staticmethod
     def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
 
@@ -1249,7 +1258,7 @@ def __init__(
             self._embed["logits"] = tf_layers.Embed(
                 self._num_tags, reg_lambda, "logits"
             )
-            self._crf = tf_layers.CRF(self._num_tags)
+            self._crf = tf_layers.CRF(self._num_tags, reg_lambda)
             self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
             self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
             self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
@@ -1550,25 +1559,32 @@ def predict(self, batch_in):
                 self.all_labels_embed[tf.newaxis, :, :],
                 None,
             )
-            label = self._create_bow(
-                tf_batch_data["intent_features"],
-                tf_batch_data["intent_mask"][0],
-                "intent",
-            )
-            label_embed = self._embed["intent"](label)
-            sim = train_utils.tf_raw_sim(
-                cls_embed[:, tf.newaxis, :], label_embed, None
-            )
+            # label = self._create_bow(
+            #     tf_batch_data["intent_features"],
+            #     tf_batch_data["intent_mask"][0],
+            #     "intent",
+            # )
+            # label_embed = self._embed["intent"](label)
+            # sim = train_utils.tf_raw_sim(
+            #     cls_embed[:, tf.newaxis, :], label_embed, None
+            # )
 
             scores = train_utils.confidence_from_sim(
                 sim_all, self._similarity_type
             )
             out["i_scores"] = scores
 
-        if self.named_entity_recognition:
+        if self._named_entity_recognition:
             sequence_lengths = sequence_lengths - 1
             logits = self._embed["logits"](text_transformed)
             pred_ids = self._crf(logits, sequence_lengths)
             out["e_ids"] = pred_ids
 
         return out
+
+    def predict_dataset(self):
+        return train_utils.create_tf_dataset(
+            self.session_data,
+            1,
+            label_key="intent_ids",
+        )
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 9f76acf74830..0fed3f11eba4 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -193,18 +193,16 @@ def __init__(self, d_model, num_heads, reg_lambda):
 
         self._depth = d_model // self.num_heads
 
-        l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._wq = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+            d_model, use_bias=False, kernel_regularizer=l2_regularizer
         )
         self._wk = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+            d_model, use_bias=False, kernel_regularizer=l2_regularizer
         )
         self._wv = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l1_regularizer
+            d_model, use_bias=False, kernel_regularizer=l2_regularizer
         )
-
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._dense = tf.keras.layers.Dense(d_model, kernel_regularizer=l2_regularizer)
 
     def _split_heads(self, x):
@@ -432,13 +430,15 @@ def call(self, x, mask, training):
 
 class CRF(tf.keras.layers.Layer):
 
-    def __init__(self, num_tags, name=None):
+    def __init__(self, num_tags, reg_lambda, name=None):
         super().__init__(name=name)
 
         initializer = tf.keras.initializers.GlorotUniform()
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self.transition_params = self.add_weight(
             shape=(num_tags, num_tags),
             initializer=initializer,
+            regularizer=l2_regularizer,
             trainable=True,
             name="transitions",
         )

From 6e65a63c1258269535be57fd688f7bb300ae5e3f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 09:25:16 +0100
Subject: [PATCH 088/633] create RasaModel

---
 .../embedding_intent_classifier.py            |  12 +-
 rasa/utils/tf_models.py                       | 136 ++++++++++++++++++
 rasa/utils/train_utils.py                     |  86 -----------
 3 files changed, 142 insertions(+), 92 deletions(-)
 create mode 100644 rasa/utils/tf_models.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7692c98fcffc..b03f1108ff2d 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -20,6 +20,7 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.utils import train_utils
 from rasa.utils import tf_layers
+from rasa.utils import tf_models
 from rasa.utils.train_utils import SessionDataType, TrainingMetrics
 from rasa.nlu.constants import (
     MESSAGE_INTENT_ATTRIBUTE,
@@ -767,8 +768,7 @@ def train(
             self.batch_strategy,
         )
 
-        train_utils.train_tf_dataset(
-            self.model,
+        self.model.fit(
             self.epochs,
             self.batch_size,
             self.evaluate_on_num_examples,
@@ -1054,12 +1054,12 @@ def load(
                 meta["batch_strategy"],
             )
 
-            train_utils.train_tf_dataset(
-                model,
+            model.fit(
                 1,
                 1,
                 0,
                 0,
+                silent=True,
             )
 
             model.load_weights(model_file)
@@ -1094,7 +1094,7 @@ def load(
             return cls(component_config=meta)
 
 
-class DIET(tf.keras.models.Model):
+class DIET(tf_models.RasaModel):
     @staticmethod
     def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
 
@@ -1487,7 +1487,7 @@ def _train_losses_scores(self, batch_in):
 
         return losses, scores
 
-    def train(self, batch_in):
+    def train_on_batch(self, batch_in):
         with tf.GradientTape() as tape:
             losses, scores = self._train_losses_scores(batch_in)
             total_loss = tf.math.add_n(list(losses.values())) + self.losses
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
new file mode 100644
index 000000000000..4a8a0140d834
--- /dev/null
+++ b/rasa/utils/tf_models.py
@@ -0,0 +1,136 @@
+import typing
+import logging
+from typing import (
+    List,
+    Optional,
+    Text,
+    Dict,
+    Tuple,
+    Union,
+    Generator,
+    Callable,
+    Any,
+    NamedTuple,
+)
+from tqdm import tqdm
+from rasa.utils import train_utils
+from rasa.utils.common import is_logging_disabled
+import tensorflow as tf
+
+logger = logging.getLogger(__name__)
+
+
+class RasaModel(tf.keras.models.Model):
+
+    def compile(self):
+        raise NotImplemented
+
+    @staticmethod
+    def _update_postfix_dict(
+            postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
+    ) -> Dict[Text, Text]:
+        for name, value in metrics.loss.items():
+            postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
+        for name, value in metrics.score.items():
+            postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
+        return postfix_dict
+
+    def fit(self,
+            epochs: int,
+            batch_size: Union[List[int], int],
+            evaluate_on_num_examples: int,
+            evaluate_every_num_epochs: int,
+            silent: bool = False,
+            output_file: Optional[Text] = None,
+    ) -> None:
+        """Train tf graph"""
+
+        if evaluate_on_num_examples > 0:
+            logger.info(
+                f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
+                f"epochs."
+            )
+        disable = silent or is_logging_disabled()
+        pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
+
+        # allows increasing batch size
+        train_dataset_func = tf.function(self.train_dataset)
+        eval_dataset_func = tf.function(self.eval_dataset)
+
+        tf_batch_size = tf.ones((), tf.int32)
+        train_func = tf.function(
+            self.train_on_batch, input_signature=[train_dataset_func(tf_batch_size).element_spec]
+        )
+        if evaluate_on_num_examples > 0:
+            eval_func = tf.function(
+                self.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
+            )
+        else:
+            eval_func = None
+
+        for ep in pbar:
+            ep_batch_size = tf_batch_size * train_utils.linearly_increasing_batch_size(
+                ep, batch_size, epochs
+            )
+
+            # Reset the metrics
+            for metric in self.train_metrics.values():
+                metric.reset_states()
+
+            # Train on batches
+            self.set_training_phase(True)
+            for batch_in in train_dataset_func(ep_batch_size):
+                train_func(batch_in)
+
+            # Get the metric results
+            postfix_dict = {k: v.result().numpy() for k, v in self.train_metrics.items()}
+
+            if evaluate_on_num_examples > 0:
+                if (
+                    ep == 0
+                    or (ep + 1) % evaluate_every_num_epochs == 0
+                    or (ep + 1) == epochs
+                ):
+                    # Reset the metrics
+                    for metric in self.eval_metrics.values():
+                        metric.reset_states()
+
+                    # Eval on batches
+                    self.set_training_phase(False)
+                    for batch_in in eval_dataset_func(ep_batch_size):
+                        eval_func(batch_in)
+
+                # Get the metric results
+                postfix_dict.update(
+                    {k: v.result().numpy() for k, v in self.eval_metrics.items()}
+                )
+
+            pbar.set_postfix(postfix_dict)
+
+            # _write_training_metrics(output_file, ep, train_metrics, val_metrics)
+        if not disable:
+            logger.info("Finished training.")
+
+    def evaluate(self):
+        pass
+
+    def predict(self):
+        pass
+
+    def train_on_batch(self, batch_in):
+        raise NotImplementedError
+
+    def test_on_batch(self):
+        raise NotImplemented
+
+    def predict_on_batch(self):
+        raise NotImplemented
+
+    def fit_generator(self):
+        raise NotImplemented
+
+    def evaluate_generator(self):
+        raise NotImplemented
+
+    def predict_generator(self):
+        raise NotImplemented
\ No newline at end of file
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index f6e1e38e3153..5a4cc0708d84 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -949,92 +949,6 @@ def output_validation_stat(
     return ep_val_metrics
 
 
-def train_tf_dataset(
-    model,
-    epochs: int,
-    batch_size: Union[List[int], int],
-    evaluate_on_num_examples: int,
-    evaluate_every_num_epochs: int,
-    output_file: Optional[Text] = None,
-) -> None:
-    """Train tf graph"""
-
-    if evaluate_on_num_examples > 0:
-        logger.info(
-            f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
-            f"epochs."
-        )
-    pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled())
-
-    # allows increasing batch size
-    train_dataset_func = tf.function(model.train_dataset)
-    eval_dataset_func = tf.function(model.eval_dataset)
-
-    tf_batch_size = tf.ones((), tf.int32)
-    train_func = tf.function(
-        model.train, input_signature=[train_dataset_func(tf_batch_size).element_spec]
-    )
-    if evaluate_on_num_examples > 0:
-        eval_func = tf.function(
-            model.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
-        )
-    else:
-        eval_func = None
-
-    for ep in pbar:
-        ep_batch_size = tf_batch_size * linearly_increasing_batch_size(
-            ep, batch_size, epochs
-        )
-
-        # Reset the metrics
-        for metric in model.train_metrics.values():
-            metric.reset_states()
-
-        # Train on batches
-        model.set_training_phase(True)
-        for batch_in in train_dataset_func(ep_batch_size):
-            train_func(batch_in)
-
-        # Get the metric results
-        postfix_dict = {k: v.result().numpy() for k, v in model.train_metrics.items()}
-
-        if evaluate_on_num_examples > 0:
-            if (
-                ep == 0
-                or (ep + 1) % evaluate_every_num_epochs == 0
-                or (ep + 1) == epochs
-            ):
-                # Reset the metrics
-                for metric in model.eval_metrics.values():
-                    metric.reset_states()
-
-                # Eval on batches
-                model.set_training_phase(False)
-                for batch_in in eval_dataset_func(ep_batch_size):
-                    eval_func(batch_in)
-
-            # Get the metric results
-            postfix_dict.update(
-                {k: v.result().numpy() for k, v in model.eval_metrics.items()}
-            )
-
-        pbar.set_postfix(postfix_dict)
-
-        # _write_training_metrics(output_file, ep, train_metrics, val_metrics)
-
-    logger.info("Finished training.")
-
-
-def _update_postfix_dict(
-    postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
-) -> Dict[Text, Text]:
-    for name, value in metrics.loss.items():
-        postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
-    for name, value in metrics.score.items():
-        postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
-    return postfix_dict
-
-
 def _write_training_metrics(
     output_file: Text,
     epoch: int,

From 8eda6dddccd4544a0689b3c11384cad47e1b5354 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 15 Jan 2020 10:00:00 +0100
Subject: [PATCH 089/633] Refactor text featurizer.

---
 rasa/nlu/components.py                        |   5 +-
 .../sparse_featurizer/entity_featurizer.py    | 281 ----------------
 .../sparse_featurizer/text_featurizer.py      | 306 ++++++++++++++++++
 tests/nlu/featurizers/test_text_featurizer.py |  85 +++++
 4 files changed, 395 insertions(+), 282 deletions(-)
 delete mode 100644 rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py
 create mode 100644 rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
 create mode 100644 tests/nlu/featurizers/test_text_featurizer.py

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index c305a2e2eb8a..448deb70ca1d 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -314,7 +314,10 @@ def provide_context(self) -> Optional[Dict[Text, Any]]:
         pass
 
     def train(
-        self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
         """Train this component.
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py
deleted file mode 100644
index decad2738013..000000000000
--- a/rasa/nlu/featurizers/sparse_featurizer/entity_featurizer.py
+++ /dev/null
@@ -1,281 +0,0 @@
-import logging
-from collections import defaultdict
-
-import numpy as np
-import os
-import pickle
-import typing
-import scipy.sparse
-from typing import Any, Dict, Optional, Text, List
-
-from rasa.nlu.featurizers.featurizer import Featurizer
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    TOKENS_NAMES,
-    TEXT_ATTRIBUTE,
-    SPARSE_FEATURE_NAMES,
-    SPACY_DOCS,
-)
-
-logger = logging.getLogger(__name__)
-
-if typing.TYPE_CHECKING:
-    from rasa.nlu.model import Metadata
-
-try:
-    import spacy
-except ImportError:
-    spacy = None
-
-
-class CRFToken(typing.NamedTuple):
-    text: Text
-    pos_tag: Text
-    pattern: Dict[Text, Any]
-
-
-class EntityFeaturizer(Featurizer):
-
-    provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
-
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
-
-    defaults = {
-        # crf_features is [before, word, after] array with before, word,
-        # after holding keys about which
-        # features to use for each word, for example, 'title' in
-        # array before will have the feature
-        # "is the preceding word in title case?"
-        # POS features require spaCy to be installed
-        "features": [
-            ["low", "title", "upper"],
-            [
-                "bias",
-                "low",
-                "prefix5",
-                "prefix2",
-                "suffix5",
-                "suffix3",
-                "suffix2",
-                "upper",
-                "title",
-                "digit",
-                "pattern",
-            ],
-            ["low", "title", "upper"],
-        ]
-    }
-
-    function_dict = {
-        "low": lambda crf_token: crf_token.text.lower(),  # pytype: disable=attribute-error
-        "title": lambda crf_token: crf_token.text.istitle(),  # pytype: disable=attribute-error
-        "prefix5": lambda crf_token: crf_token.text[:5],
-        "prefix2": lambda crf_token: crf_token.text[:2],
-        "suffix5": lambda crf_token: crf_token.text[-5:],
-        "suffix3": lambda crf_token: crf_token.text[-3:],
-        "suffix2": lambda crf_token: crf_token.text[-2:],
-        "suffix1": lambda crf_token: crf_token.text[-1:],
-        "pos": lambda crf_token: crf_token.pos_tag,
-        "pos2": lambda crf_token: crf_token.pos_tag[:2],
-        "bias": lambda crf_token: "bias",
-        "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
-        "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
-        "pattern": lambda crf_token: crf_token.pattern,
-    }
-
-    def __init__(
-        self,
-        component_config: Dict[Text, Any],
-        feature_id_dict: Optional[Dict[Text, Dict[Text, int]]] = None,
-    ):
-        super().__init__(component_config)
-
-        self.feature_id_dict = feature_id_dict
-        self._check_pos_features_and_spacy()
-
-    def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
-    ) -> None:
-        self.feature_id_dict = self._create_feature_id_dict(training_data)
-
-        for example in training_data.training_examples:
-            self._text_features_for_entities(example)
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-        self._text_features_for_entities(message)
-
-    def _text_features_for_entities(self, message: Message) -> None:
-        tokens = self._from_text_to_crf(message)
-        features = self._sentence_to_features(tokens)
-
-        num_features = sum(
-            [
-                len(feature_vals.values())
-                for feature_vals in self.feature_id_dict.values()
-            ]
-        )
-
-        vec = np.zeros([len(tokens), num_features])
-
-        # convert features into one-hot
-        for token_idx, token in enumerate(features):
-            for k, v in token.items():
-                if k in self.feature_id_dict and str(v) in self.feature_id_dict[k]:
-                    feature_idx = self.feature_id_dict[k][str(v)]
-                    vec[token_idx][feature_idx] = 1
-
-        entity_features = scipy.sparse.coo_matrix(vec)
-
-        # set features
-        features = self._combine_with_existing_sparse_features(
-            message, entity_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        )
-        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], features)
-
-    def _create_feature_id_dict(
-        self, training_data: TrainingData
-    ) -> Dict[Text, Dict[Text, int]]:
-        features = []
-        for example in training_data.training_examples:
-            tokens = self._from_text_to_crf(example)
-            features.append(self._sentence_to_features(tokens))
-
-        # build vocab of features
-        vocab_x = defaultdict(set)
-        for sent_features in features:
-            for token_features in sent_features:
-                for key, val in token_features.items():
-                    vocab_x[key].add(val)
-
-        feature_id_dict = {}
-        offset = 0
-        for key, val in vocab_x.items():
-            feature_id_dict[key] = {
-                str(feature_val): idx
-                for idx, feature_val in enumerate(sorted(val), offset)
-            }
-            offset += len(val)
-
-        return feature_id_dict
-
-    def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any]]:
-        """Convert a word into discrete features in self.crf_features,
-        including word before and word after."""
-
-        configured_features = self.component_config["features"]
-        sentence_features = []
-
-        for word_idx in range(len(sentence)):
-            # word before(-1), current word(0), next word(+1)
-            feature_span = len(configured_features)
-            half_span = feature_span // 2
-            feature_range = range(-half_span, half_span + 1)
-            prefixes = [str(i) for i in feature_range]
-            word_features = {}
-            for f_i in feature_range:
-                if word_idx + f_i >= len(sentence):
-                    word_features["EOS"] = True
-                    # End Of Sentence
-                elif word_idx + f_i < 0:
-                    word_features["BOS"] = True
-                    # Beginning Of Sentence
-                else:
-                    word = sentence[word_idx + f_i]
-                    f_i_from_zero = f_i + half_span
-                    prefix = prefixes[f_i_from_zero]
-                    features = configured_features[f_i_from_zero]
-                    for feature in features:
-                        if feature == "pattern":
-                            # add all regexes as a feature
-                            regex_patterns = self.function_dict[feature](word)
-                            # pytype: disable=attribute-error
-                            for p_name, matched in regex_patterns.items():
-                                feature_name = prefix + ":" + feature + ":" + p_name
-                                word_features[feature_name] = matched
-                            # pytype: enable=attribute-error
-                        else:
-                            # append each feature to a feature vector
-                            value = self.function_dict[feature](word)
-                            word_features[prefix + ":" + feature] = value
-            sentence_features.append(word_features)
-        return sentence_features
-
-    def _from_text_to_crf(self, message: Message) -> List[CRFToken]:
-        """Takes a sentence and switches it to crfsuite format."""
-
-        crf_format = []
-        if self.pos_features:
-            tokens = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
-        else:
-            tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
-
-        for i, token in enumerate(tokens):
-            pattern = self.__pattern_of_token(message, i)
-            pos_tag = self.__tag_of_token(token) if self.pos_features else None
-
-            crf_format.append(CRFToken(token.text, pos_tag, pattern))
-
-        return crf_format
-
-    @staticmethod
-    def __pattern_of_token(message, i):
-        if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None:
-            return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get("pattern", {})
-        else:
-            return {}
-
-    @staticmethod
-    def __tag_of_token(token):
-        if spacy.about.__version__ > "2" and token._.has("tag"):
-            return token._.get("tag")
-        else:
-            return token.tag_
-
-    def _check_pos_features_and_spacy(self):
-        import itertools
-
-        features = self.component_config.get("features", [])
-        fts = set(itertools.chain.from_iterable(features))
-        self.pos_features = "pos" in fts or "pos2" in fts
-        if self.pos_features:
-            self._check_spacy()
-
-    @staticmethod
-    def _check_spacy():
-        if spacy is None:
-            raise ImportError(
-                "Failed to import `spaCy`. "
-                "`spaCy` is required for POS features "
-                "See https://spacy.io/usage/ for installation"
-                "instructions."
-            )
-
-    @classmethod
-    def load(
-        cls,
-        meta: Dict[Text, Any],
-        model_dir: Optional[Text] = None,
-        model_metadata: Optional["Metadata"] = None,
-        cached_component: Optional["EntityFeaturizer"] = None,
-        **kwargs: Any,
-    ) -> "EntityFeaturizer":
-
-        file_name = meta.get("file")
-
-        with open(
-            os.path.join(model_dir, file_name + ".feature_id_dict.pkl"), "rb"
-        ) as f:
-            feature_id_dict = pickle.load(f)
-
-        return EntityFeaturizer(meta, feature_id_dict=feature_id_dict)
-
-    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
-        """Persist this model into the passed directory.
-        Return the metadata necessary to load the model again."""
-        with open(
-            os.path.join(model_dir, file_name + ".feature_id_dict.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.feature_id_dict, f)
-
-        return {"file": file_name}
diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
new file mode 100644
index 000000000000..c6cb4c683849
--- /dev/null
+++ b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
@@ -0,0 +1,306 @@
+import logging
+from collections import defaultdict, OrderedDict
+
+import numpy as np
+import os
+import pickle
+import typing
+import scipy.sparse
+from typing import Any, Dict, Optional, Text, List
+
+from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+
+logger = logging.getLogger(__name__)
+
+if typing.TYPE_CHECKING:
+    from rasa.nlu.model import Metadata
+
+try:
+    import spacy
+except ImportError:
+    spacy = None
+
+
+class Word(typing.NamedTuple):
+    text: Text
+    pos_tag: Text
+    pattern: Dict[Text, Any]
+
+
+class TextFeaturizer(Featurizer):
+
+    provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
+
+    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+
+    defaults = {
+        # 'features' is [before, word, after] array with before, word,
+        # after holding keys about which features to use for each word,
+        # for example, 'title' in array before will have the feature
+        # "is the preceding word in title case?"
+        # POS features require spaCy to be installed
+        "features": [
+            ["low", "title", "upper"],
+            [
+                "bias",
+                "low",
+                "prefix5",
+                "prefix2",
+                "suffix5",
+                "suffix3",
+                "suffix2",
+                "upper",
+                "title",
+                "digit",
+                "pattern",
+            ],
+            ["low", "title", "upper"],
+        ]
+    }
+
+    function_dict = {
+        "low": lambda word: word.text.islower(),
+        "title": lambda word: word.text.istitle(),
+        "prefix5": lambda word: word.text[:5],
+        "prefix2": lambda word: word.text[:2],
+        "suffix5": lambda word: word.text[-5:],
+        "suffix3": lambda word: word.text[-3:],
+        "suffix2": lambda word: word.text[-2:],
+        "suffix1": lambda word: word.text[-1:],
+        "pos": lambda word: word.pos_tag,
+        "pos2": lambda word: word.pos_tag[:2],
+        "bias": lambda word: "bias",
+        "upper": lambda word: word.text.isupper(),
+        "digit": lambda word: word.text.isdigit(),
+        "pattern": lambda word: word.pattern,
+    }
+
+    def __init__(
+        self,
+        component_config: Dict[Text, Any],
+        feature_to_idx_dict: Optional[Dict[Text, Any]] = None,
+    ):
+        super().__init__(component_config)
+
+        self.feature_to_idx_dict = feature_to_idx_dict
+        self._check_pos_features_and_spacy()
+
+    def _check_pos_features_and_spacy(self):
+        import itertools
+
+        features = set(
+            itertools.chain.from_iterable(self.component_config.get("features", []))
+        )
+        self.pos_features = "pos" in features or "pos2" in features
+
+        if self.pos_features and spacy is None:
+            raise ImportError(
+                "Failed to import `spaCy`. `spaCy` is required for POS features. "
+                "See https://spacy.io/usage/ for installation instructions."
+            )
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
+    ) -> None:
+        self.feature_to_idx_dict = self._create_feature_to_idx_dict(training_data)
+
+        for example in training_data.training_examples:
+            self._create_text_features(example)
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+        self._create_text_features(message)
+
+    def _create_text_features(self, message: Message) -> None:
+        """Convert incoming messages into sparse features using the configured
+        features."""
+
+        words = self._convert_to_words(message)
+        word_features = self._words_to_features(words)
+        features = self._features_to_one_hot(word_features)
+        features = self._combine_with_existing_sparse_features(
+            message, features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        )
+        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], features)
+
+    def _features_to_one_hot(
+        self, word_features: List[Dict[Text, Any]]
+    ) -> scipy.sparse.spmatrix:
+        """Convert the word features into a one-hot presentation using the indices
+        in the feature-to-idx dictionary."""
+
+        vec = self._initialize_feature_vector(len(word_features))
+
+        for word_idx, word_features in enumerate(word_features):
+            for feature_key, feature_value in word_features.items():
+                if (
+                    feature_key in self.feature_to_idx_dict
+                    and str(feature_value) in self.feature_to_idx_dict[feature_key]
+                ):
+                    feature_idx = self.feature_to_idx_dict[feature_key][
+                        str(feature_value)
+                    ]
+                    vec[word_idx][feature_idx] = 1
+
+        # set vector of CLS token to sum of everything
+        vec[-1] = np.sum(vec, axis=0)
+
+        return scipy.sparse.coo_matrix(vec)
+
+    def _initialize_feature_vector(self, number_of_tokens: int) -> np.ndarray:
+        """Initialize a feature vector of size number-of-tokens x number-of-features
+        with zeros."""
+
+        number_of_features = sum(
+            [
+                len(feature_values.values())
+                for feature_values in self.feature_to_idx_dict.values()
+            ]
+        )
+        # +1 for the CLS token
+        return np.zeros([number_of_tokens + 1, number_of_features])
+
+    def _create_feature_to_idx_dict(
+        self, training_data: TrainingData
+    ) -> Dict[Text, Dict[Text, int]]:
+        """Create dictionary of all feature values.
+
+        Each feature key, defined in the component configuration, points to
+        different feature values and their indices in the overall resulting
+        feature vector."""
+
+        # get all possible feature values
+        features = []
+        for example in training_data.training_examples:
+            words = self._convert_to_words(example)
+            features.append(self._words_to_features(words))
+
+        # build vocabulary of features
+        feature_vocabulary = defaultdict(set)
+        for sent_features in features:
+            for word_features in sent_features:
+                for feature_name, feature_value in word_features.items():
+                    feature_vocabulary[feature_name].add(feature_value)
+
+        feature_vocabulary = OrderedDict(sorted(feature_vocabulary.items()))
+
+        # assign a unique index to each feature value
+        feature_to_idx_dict = {}
+        offset = 0
+        for feature_name, feature_values in feature_vocabulary.items():
+            feature_to_idx_dict[feature_name] = {
+                str(feature_value): feature_idx
+                for feature_idx, feature_value in enumerate(
+                    sorted(feature_values), start=offset
+                )
+            }
+            offset += len(feature_values)
+        print(feature_to_idx_dict)
+        return feature_to_idx_dict
+
+    def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:
+        """Convert words into discrete features."""
+
+        configured_features = self.component_config["features"]
+        words_features = []
+
+        for word_idx in range(len(words)):
+            # get the window size (e.g. before, word, after) of the configured features
+            # in case of an even number we will look at one more word before,
+            # e.g. window size 4 will result in a window range of
+            # [-2, -1, 0, 1] (0 = current word in sentence)
+            window_size = len(configured_features)
+            half_window_size = window_size // 2
+            window_range = range(-half_window_size, half_window_size + window_size % 2)
+
+            prefixes = [str(i) for i in window_range]
+
+            word_features = {}
+
+            for pointer_position in window_range:
+                if word_idx + pointer_position >= len(words):
+                    word_features["EOS"] = True
+                elif word_idx + pointer_position < 0:
+                    word_features["BOS"] = True
+                else:
+                    word = words[word_idx + pointer_position]
+
+                    current_feature_idx = pointer_position + half_window_size
+                    prefix = prefixes[current_feature_idx]
+                    features = configured_features[current_feature_idx]
+
+                    for feature in features:
+                        if feature == "pattern":
+                            # add all regexes as a feature
+                            regex_patterns = self.function_dict[feature](word)
+                            for p_name, matched in regex_patterns.items():
+                                feature_name = prefix + ":" + feature + ":" + p_name
+                                word_features[feature_name] = matched
+                        else:
+                            # append each feature to a feature vector
+                            value = self.function_dict[feature](word)
+                            word_features[prefix + ":" + feature] = value
+
+            words_features.append(word_features)
+
+        return words_features
+
+    def _convert_to_words(self, message: Message) -> List[Word]:
+        """Takes a sentence and switches it to crfsuite format."""
+
+        words = []
+        if self.pos_features:
+            tokens = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+        else:
+            tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
+            # remove CLS token
+            tokens = tokens[:-1]
+
+        for i, token in enumerate(tokens):
+            pattern = token.get("pattern", {})
+            pos_tag = self._tag_of_token(token) if self.pos_features else None
+
+            words.append(Word(token.text, pos_tag, pattern))
+
+        return words
+
+    @staticmethod
+    def _tag_of_token(token):
+        if spacy.about.__version__ > "2" and token._.has("tag"):
+            return token._.get("tag")
+        else:
+            return token.tag_
+
+    @classmethod
+    def load(
+        cls,
+        meta: Dict[Text, Any],
+        model_dir: Optional[Text] = None,
+        model_metadata: Optional["Metadata"] = None,
+        cached_component: Optional["TextFeaturizer"] = None,
+        **kwargs: Any,
+    ) -> "TextFeaturizer":
+
+        file_name = meta.get("file")
+
+        with open(
+            os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "rb"
+        ) as f:
+            feature_to_idx_dict = pickle.load(f)
+
+        return TextFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict)
+
+    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
+        """Persist this model into the passed directory.
+        Return the metadata necessary to load the model again."""
+        with open(
+            os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "wb"
+        ) as f:
+            pickle.dump(self.feature_to_idx_dict, f)
+
+        return {"file": file_name}
diff --git a/tests/nlu/featurizers/test_text_featurizer.py b/tests/nlu/featurizers/test_text_featurizer.py
new file mode 100644
index 000000000000..4cc7b5364bf9
--- /dev/null
+++ b/tests/nlu/featurizers/test_text_featurizer.py
@@ -0,0 +1,85 @@
+import numpy as np
+import pytest
+
+import scipy.sparse
+
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.featurizers.sparse_featurizer.text_featurizer import TextFeaturizer
+from rasa.nlu.training_data import TrainingData
+from rasa.nlu.constants import TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+from rasa.nlu.training_data import Message
+
+
+@pytest.mark.parametrize(
+    "sentence, expected, expected_cls",
+    [
+        (
+            "hello goodbye hello",
+            [[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0]],
+            [[2.0, 3.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0]],
+        ),
+        (
+            "a 1 2",
+            [[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]],
+            [[2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0]],
+        ),
+    ],
+)
+def test_text_featurizer(sentence, expected, expected_cls):
+    featurizer = TextFeaturizer(
+        {"features": [["upper"], ["prefix2", "suffix2", "digit"], ["low"]]}
+    )
+
+    train_message = Message(sentence)
+    test_message = Message(sentence)
+
+    WhitespaceTokenizer().process(train_message)
+    WhitespaceTokenizer().process(test_message)
+
+    featurizer.train(TrainingData([train_message]))
+
+    featurizer.process(test_message)
+
+    assert isinstance(
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+    )
+
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+
+    assert np.all(actual[0] == expected)
+    assert np.all(actual[-1] == expected_cls)
+
+
+@pytest.mark.parametrize(
+    "sentence, expected, expected_cls",
+    [
+        (
+            "hello 123 hello 123 hello",
+            [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]],
+            [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0]],
+        )
+    ],
+)
+def test_text_featurizer_window_size(sentence, expected, expected_cls):
+    featurizer = TextFeaturizer(
+        {"features": [["upper"], ["digit"], ["low"], ["digit"]]}
+    )
+
+    train_message = Message(sentence)
+    test_message = Message(sentence)
+
+    WhitespaceTokenizer().process(train_message)
+    WhitespaceTokenizer().process(test_message)
+
+    featurizer.train(TrainingData([train_message]))
+
+    featurizer.process(test_message)
+
+    assert isinstance(
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+    )
+
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+
+    assert np.all(actual[0] == expected)
+    assert np.all(actual[-1] == expected_cls)

From 1597bd36f4a8f5d8239879f1893f582a1881fa50 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 11:01:18 +0100
Subject: [PATCH 090/633] add TODO

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 1 +
 rasa/utils/tf_models.py                             | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b03f1108ff2d..4de14ae04c3e 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1425,6 +1425,7 @@ def _entity_loss(
         # CRF preds
         pred_ids = self._crf(logits, sequence_lengths)
 
+        # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
         mask_bool = tf.cast(mask[:, :, 0], tf.bool)
         c_masked = tf.boolean_mask(c, mask_bool)
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 4a8a0140d834..23d26799947e 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -58,7 +58,7 @@ def fit(self,
         eval_dataset_func = tf.function(self.eval_dataset)
 
         tf_batch_size = tf.ones((), tf.int32)
-        train_func = tf.function(
+        train_on_batch_func = tf.function(
             self.train_on_batch, input_signature=[train_dataset_func(tf_batch_size).element_spec]
         )
         if evaluate_on_num_examples > 0:
@@ -80,7 +80,7 @@ def fit(self,
             # Train on batches
             self.set_training_phase(True)
             for batch_in in train_dataset_func(ep_batch_size):
-                train_func(batch_in)
+                train_on_batch_func(batch_in)
 
             # Get the metric results
             postfix_dict = {k: v.result().numpy() for k, v in self.train_metrics.items()}

From b66c7d944ef886076bdbe47476588124c7a5f4c9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 15 Jan 2020 11:04:32 +0100
Subject: [PATCH 091/633] add changelog

---
 changelog/5065.feature.rst | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 changelog/5065.feature.rst

diff --git a/changelog/5065.feature.rst b/changelog/5065.feature.rst
new file mode 100644
index 000000000000..f65d068ab872
--- /dev/null
+++ b/changelog/5065.feature.rst
@@ -0,0 +1,4 @@
+Add ``TextFeaturizer`` to sparse featurizers.
+
+``TextFeaturizer`` does the same featurization as the ``CRFEntityExtractor``. We extracted the featurization into
+a separate component so that the features can be reused and featurization is independent from the entity extraction.
\ No newline at end of file

From f5fb176faad87d135786cc24b3eb4183ea1546e4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 15 Jan 2020 11:37:17 +0100
Subject: [PATCH 092/633] Add documentation

---
 docs/nlu/components.rst                       | 68 +++++++++++++++++++
 .../sparse_featurizer/text_featurizer.py      | 28 +++-----
 2 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index ce7996861f7c..255d96522859 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -305,6 +305,74 @@ CountVectorsFeaturizer
           OOV_words: []  # list of strings
 
 
+TextFeaturizer
+~~~~~~~~~~~~~~~
+
+:Short: Text feature creation to support entity extraction.
+:Outputs:
+   ``text_sparse_features``
+:Requires: ``tokens``
+:Type: Sparse featurizer
+:Description:
+    Creates features for entity extraction.
+    Moves with a sliding window over every token in the user message and creates features according to the configured
+    features (see below).
+    Features could for example be if a token is upper case, if it is a digit, or the prefix of that token (e.g.
+    first two characters).
+:Configuration:
+    You need to configure what kind of text features the featurizer should extract.
+    The following features are available:
+
+    ==============  =============================================================================================
+    Feature Name    Description
+    ==============  =============================================================================================
+    low             Checks if the word is lower case.
+    upper           Checks if the word is upper case.
+    title           Checks if the word starts with an uppercase character and all remaining characters are lowercased.
+    prefix5         Take the first five characters of the word.
+    prefix2         Take the first two characters of the word.
+    suffix5         Take the last five characters of the word.
+    suffix3         Take the last three characters of the word.
+    suffix2         Take the last two characters of the word.
+    suffix1         Take the last character of the word.
+    pos             Take the Part-of-Speech tag of the word (spaCy required).
+    pos2            Take the first two characters of the Part-of-Speech tag of the word (spaCy required).
+    bias            Adds "bias".
+    digit           Checks if the word contains just digits.
+    ==============  =============================================================================================
+
+    As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
+    previous words, the current word in the sliding window, and the next words.
+    You define the features as [before, word, after] array.
+    If you, for example, want to define features for the word before, the current word, and the word after,
+    your features configuration could look like this:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "TextFeaturizer":
+          "features": [
+            ["low", "title", "upper"],
+            [
+              "bias",
+              "low",
+              "prefix5",
+              "prefix2",
+              "suffix5",
+              "suffix3",
+              "suffix2",
+              "upper",
+              "title",
+              "digit",
+            ],
+            ["low", "title", "upper"],
+          ]
+
+    This configuration is also the default configuration.
+
+    .. note:: If you want to make use of ``pos`` or ``pos2`` you need to add ``SpacyNLP`` to your pipeline.
+
+
 Intent Classifiers
 ------------------
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
index c6cb4c683849..0b6dfc5bf645 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
@@ -11,7 +11,12 @@
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TEXT_ATTRIBUTE,
+    SPARSE_FEATURE_NAMES,
+    SPACY_DOCS,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -27,7 +32,6 @@
 class Word(typing.NamedTuple):
     text: Text
     pos_tag: Text
-    pattern: Dict[Text, Any]
 
 
 class TextFeaturizer(Featurizer):
@@ -55,7 +59,6 @@ class TextFeaturizer(Featurizer):
                 "upper",
                 "title",
                 "digit",
-                "pattern",
             ],
             ["low", "title", "upper"],
         ]
@@ -75,7 +78,6 @@ class TextFeaturizer(Featurizer):
         "bias": lambda word: "bias",
         "upper": lambda word: word.text.isupper(),
         "digit": lambda word: word.text.isdigit(),
-        "pattern": lambda word: word.pattern,
     }
 
     def __init__(
@@ -235,16 +237,9 @@ def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:
                     features = configured_features[current_feature_idx]
 
                     for feature in features:
-                        if feature == "pattern":
-                            # add all regexes as a feature
-                            regex_patterns = self.function_dict[feature](word)
-                            for p_name, matched in regex_patterns.items():
-                                feature_name = prefix + ":" + feature + ":" + p_name
-                                word_features[feature_name] = matched
-                        else:
-                            # append each feature to a feature vector
-                            value = self.function_dict[feature](word)
-                            word_features[prefix + ":" + feature] = value
+                        # append each feature to a feature vector
+                        value = self.function_dict[feature](word)
+                        word_features[prefix + ":" + feature] = value
 
             words_features.append(word_features)
 
@@ -255,17 +250,16 @@ def _convert_to_words(self, message: Message) -> List[Word]:
 
         words = []
         if self.pos_features:
-            tokens = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+            tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
         else:
             tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
             # remove CLS token
             tokens = tokens[:-1]
 
         for i, token in enumerate(tokens):
-            pattern = token.get("pattern", {})
             pos_tag = self._tag_of_token(token) if self.pos_features else None
 
-            words.append(Word(token.text, pos_tag, pattern))
+            words.append(Word(token.text, pos_tag))
 
         return words
 

From c1432fe53871864ab43338228faaad18a9bf81be Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 15 Jan 2020 13:07:50 +0100
Subject: [PATCH 093/633] Add more tests.

---
 .../sparse_featurizer/text_featurizer.py      | 40 +++++++++----
 rasa/nlu/registry.py                          |  2 +
 tests/nlu/featurizers/test_text_featurizer.py | 57 ++++++++++++++++++-
 tests/nlu/training/test_train.py              |  1 +
 4 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
index 0b6dfc5bf645..1593ab994880 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
@@ -87,7 +87,11 @@ def __init__(
     ):
         super().__init__(component_config)
 
-        self.feature_to_idx_dict = feature_to_idx_dict
+        if feature_to_idx_dict is None:
+            self.feature_to_idx_dict = {}
+        else:
+            self.feature_to_idx_dict = feature_to_idx_dict
+
         self._check_pos_features_and_spacy()
 
     def _check_pos_features_and_spacy(self):
@@ -225,21 +229,28 @@ def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:
             word_features = {}
 
             for pointer_position in window_range:
-                if word_idx + pointer_position >= len(words):
+                current_idx = word_idx + pointer_position
+
+                # skip, if current_idx is pointing to a non-existing word
+                if current_idx < 0 or current_idx >= len(words):
+                    continue
+
+                # check if we are at the start or at the end
+                if word_idx == len(words) - 1 and pointer_position == 0:
                     word_features["EOS"] = True
-                elif word_idx + pointer_position < 0:
+                elif word_idx == 0 and pointer_position == 0:
                     word_features["BOS"] = True
-                else:
-                    word = words[word_idx + pointer_position]
 
-                    current_feature_idx = pointer_position + half_window_size
-                    prefix = prefixes[current_feature_idx]
-                    features = configured_features[current_feature_idx]
+                word = words[word_idx + pointer_position]
+
+                current_feature_idx = pointer_position + half_window_size
+                prefix = prefixes[current_feature_idx]
+                features = configured_features[current_feature_idx]
 
-                    for feature in features:
-                        # append each feature to a feature vector
-                        value = self.function_dict[feature](word)
-                        word_features[prefix + ":" + feature] = value
+                for feature in features:
+                    # append each feature to a feature vector
+                    value = self.function_dict[feature](word)
+                    word_features[prefix + ":" + feature] = value
 
             words_features.append(word_features)
 
@@ -251,6 +262,11 @@ def _convert_to_words(self, message: Message) -> List[Word]:
         words = []
         if self.pos_features:
             tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
+            if not tokens:
+                raise ValueError(
+                    f"Missing '{SPACY_DOCS[TEXT_ATTRIBUTE]}'. "
+                    f"Make sure to add 'SpacyNLP' to your pipeline."
+                )
         else:
             tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
             # remove CLS token
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 04c7e88df940..f78f9b79915e 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -19,6 +19,7 @@
 from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
 from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
 from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
+from rasa.nlu.featurizers.sparse_featurizer.text_featurizer import TextFeaturizer
 from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
     CountVectorsFeaturizer,
 )
@@ -66,6 +67,7 @@
     SpacyFeaturizer,
     MitieFeaturizer,
     RegexFeaturizer,
+    TextFeaturizer,
     CountVectorsFeaturizer,
     ConveRTFeaturizer,
     # classifiers
diff --git a/tests/nlu/featurizers/test_text_featurizer.py b/tests/nlu/featurizers/test_text_featurizer.py
index 4cc7b5364bf9..971644dedf84 100644
--- a/tests/nlu/featurizers/test_text_featurizer.py
+++ b/tests/nlu/featurizers/test_text_featurizer.py
@@ -6,7 +6,7 @@
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.featurizers.sparse_featurizer.text_featurizer import TextFeaturizer
 from rasa.nlu.training_data import TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+from rasa.nlu.constants import TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES, SPACY_DOCS
 from rasa.nlu.training_data import Message
 
 
@@ -56,7 +56,7 @@ def test_text_featurizer(sentence, expected, expected_cls):
         (
             "hello 123 hello 123 hello",
             [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]],
-            [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0]],
+            [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 1.0, 1.0]],
         )
     ],
 )
@@ -83,3 +83,56 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):
 
     assert np.all(actual[0] == expected)
     assert np.all(actual[-1] == expected_cls)
+
+
+def test_text_featurizer_missing_spacy_nlp():
+    featurizer = TextFeaturizer({"features": [["pos", "pos2"]]})
+
+    train_message = Message("Missing spacy.")
+
+    WhitespaceTokenizer().process(train_message)
+
+    with pytest.raises(ValueError) as excpetions:
+        featurizer.train(TrainingData([train_message]))
+
+    assert "Make sure to add 'SpacyNLP' to your pipeline." in str(excpetions.value)
+
+
+@pytest.mark.parametrize(
+    "sentence, expected",
+    [
+        (
+            "The sun is shining",
+            [
+                [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
+                [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
+                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0],
+            ],
+        )
+    ],
+)
+def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
+    featurizer = TextFeaturizer({"features": [["pos", "pos2"]]})
+
+    train_message = Message(sentence)
+    test_message = Message(sentence)
+
+    WhitespaceTokenizer().process(train_message)
+    WhitespaceTokenizer().process(test_message)
+
+    train_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+    test_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+
+    featurizer.train(TrainingData([train_message]))
+
+    featurizer.process(test_message)
+
+    assert isinstance(
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+    )
+
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+
+    assert np.all(actual == expected)
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 6085b4451099..4c563174ed40 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -34,6 +34,7 @@ def pipelines_for_tests():
                 "MitieFeaturizer",
                 "SpacyFeaturizer",
                 "RegexFeaturizer",
+                "TextFeaturizer",
                 "CountVectorsFeaturizer",
                 "ConveRTFeaturizer",
                 "MitieEntityExtractor",

From b575883d796e1e135507ced26478eb318151d73a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 14:37:39 +0100
Subject: [PATCH 094/633] fix eager eval

---
 rasa/utils/tf_models.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 950c7677d7ec..8df12389c776 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -71,9 +71,12 @@ def fit(self,
             )
 
         if evaluate_on_num_examples > 0:
-            eval_func = tf.function(
-                self.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
-            )
+            if eager:
+                eval_func = self.eval
+            else:
+                eval_func = tf.function(
+                    self.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
+                )
         else:
             eval_func = None
 

From 0e2133a70230c9c415381204afa040e878cd45cf Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 15:07:46 +0100
Subject: [PATCH 095/633] fix regularization loss

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9e3cab68d963..6e2a17a8b57f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1518,7 +1518,8 @@ def _train_losses_scores(self, batch_in):
     def train_on_batch(self, batch_in):
         with tf.GradientTape() as tape:
             losses, scores = self._train_losses_scores(batch_in)
-            total_loss = tf.math.add_n(list(losses.values())) + self.losses
+            regularization_loss = tf.math.add_n(self.losses)
+            total_loss = tf.math.add_n(list(losses.values())) + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))

From eadfb7a7f860034d2f066a80cb1a3f63e1065b01 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 15:45:19 +0100
Subject: [PATCH 096/633] set training to False during loading

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 4 +++-
 rasa/nlu/components.py                              | 2 +-
 rasa/utils/tf_layers.py                             | 6 ------
 rasa/utils/tf_models.py                             | 3 +++
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6e2a17a8b57f..948d73656785 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1090,6 +1090,7 @@ def load(
             model.load_weights(tf_model_file)
 
             # build the graph for prediction
+            model.set_training_phase(False)
             model.session_data = {
                 k: vs
                 for k, vs in model.session_data.items()
@@ -1519,7 +1520,8 @@ def train_on_batch(self, batch_in):
         with tf.GradientTape() as tape:
             losses, scores = self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
-            total_loss = tf.math.add_n(list(losses.values())) + regularization_loss
+            pred_loss = tf.math.add_n(list(losses.values()))
+            total_loss = pred_loss + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index c305a2e2eb8a..a0a51613170e 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -452,7 +452,7 @@ def __add_to_cache(self, component: Component, cache_key: Optional[Text]) -> Non
 
         if cache_key is not None and self.use_cache:
             self.component_cache[cache_key] = component
-            logger.info(
+            logger.debug(
                 f"Added '{component.name}' to component cache. Key '{cache_key}'."
             )
 
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 0fed3f11eba4..35a2bc007569 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -15,18 +15,12 @@
 import tensorflow as tf
 import tensorflow_addons as tfa
 import numpy as np
-from rasa.utils import train_utils
-
-if typing.TYPE_CHECKING:
-    from tensor2tensor.utils.hparam import HParams
 
 logger = logging.getLogger(__name__)
 
 
 class SparseDropout(tf.keras.layers.Dropout):
     def call(self, inputs, training):
-        if training is None:
-            training = tf.keras.backend.learning_phase()
 
         to_retain_prob = tf.random.uniform(
             tf.shape(inputs.values), 0, 1, inputs.values.dtype
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 8df12389c776..f4fd923fdbea 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -94,6 +94,9 @@ def fit(self,
             for batch_in in train_dataset_func(ep_batch_size):
                 train_on_batch_func(batch_in)
 
+            # print(self.metrics)
+            # exit()
+
             # Get the metric results
             postfix_dict = {k: v.result().numpy() for k, v in self.train_metrics.items()}
 

From 6b67ab5d6fd11cb9092530f222979970029995f0 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 15 Jan 2020 16:36:30 +0100
Subject: [PATCH 097/633] fix entity prediction

---
 .../embedding_intent_classifier.py            | 19 ++++++++++++++-----
 rasa/utils/tf_layers.py                       |  6 +++++-
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 948d73656785..08c3dbba59bd 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -345,8 +345,6 @@ def _create_tag_id_dict(
             }
             tag_id_dict["O"] = 0
 
-            print(tag_id_dict)
-
             return tag_id_dict
 
         distinct_tag_ids = set(
@@ -871,12 +869,14 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
         predictions = out["e_ids"].numpy()
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
-
+        # print(len(tags))
+        # print(len(message.get("tokens", [])))
+        # exit()
         if self.bilou_flag:
             tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
 
         entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags
+            message.text, message.get("tokens", []), tags,predictions
         )
 
         extracted = self.add_extractor_name(entities)
@@ -884,8 +884,9 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
 
         return entities
 
+    @staticmethod
     def _convert_tags_to_entities(
-            self, text: str, tokens: List[Token], tags: List[Text]
+        text: str, tokens: List[Token], tags: List[Text],predictions
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = "O"
@@ -913,6 +914,13 @@ def _convert_tags_to_entities(
         for entity in entities:
             entity["value"] = text[entity["start"]: entity["end"]]
 
+            if not entity["value"]:
+                print(text)
+                print([t.text for t in tokens])
+                print(tags)
+                print(predictions)
+                exit()
+
         return entities
 
     def process(self, message: "Message", **kwargs: Any) -> None:
@@ -1456,6 +1464,7 @@ def _entity_loss(
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
         mask_bool = tf.cast(mask[:, :, 0], tf.bool)
+        # pick only non padding values and flatten sequences
         c_masked = tf.boolean_mask(c, mask_bool)
         pred_ids_masked = tf.boolean_mask(pred_ids, mask_bool)
         # set `0` prediction to not a prediction
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 35a2bc007569..4f05049b0a03 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -441,7 +441,11 @@ def call(self, logits, sequence_lengths):
         pred_ids, _ = tfa.text.crf.crf_decode(
             logits, self.transition_params, sequence_lengths
         )
-        return pred_ids
+        # set prediction index for padding to `0`
+        mask = tf.sequence_mask(
+            sequence_lengths, maxlen=tf.shape(pred_ids)[1], dtype=pred_ids.dtype)
+
+        return pred_ids * mask
 
     def loss(self, logits, tag_indices, sequence_lengths):
         log_likelihood, _ = tfa.text.crf.crf_log_likelihood(

From 6cc6a113f470e81d3c7cadaa57e34d7a0ab9104b Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Wed, 15 Jan 2020 17:00:53 +0100
Subject: [PATCH 098/633] Update
 rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
index 1593ab994880..3ef927835784 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
@@ -206,7 +206,6 @@ def _create_feature_to_idx_dict(
                 )
             }
             offset += len(feature_values)
-        print(feature_to_idx_dict)
         return feature_to_idx_dict
 
     def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:

From d9b092d604b32e8e686a123408c2649d69fc83b3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 09:13:50 +0100
Subject: [PATCH 099/633] remove plotter and bilou flag

---
 .../embedding_intent_classifier.py            | 154 +++---------------
 rasa/utils/plotter.py                         | 114 -------------
 2 files changed, 25 insertions(+), 243 deletions(-)
 delete mode 100644 rasa/utils/plotter.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 08c3dbba59bd..3f46f3285622 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -8,23 +8,20 @@
 import warnings
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union
-from shutil import copyfile
 
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 
 import rasa.utils.io as io_utils
-from rasa.utils.plotter import Plotter
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
-from rasa.nlu.components import Component, any_of
+from rasa.nlu.components import any_of
 
 from rasa.utils import train_utils
 from rasa.utils import tf_layers
 from rasa.utils import tf_models
-from rasa.utils.train_utils import SessionDataType, TrainingMetrics
+from rasa.utils.train_utils import SessionDataType
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -46,7 +43,6 @@
     from rasa.nlu.training_data import Message
 
 
-MESSAGE_BILOU_ENTITIES_ATTRIBUTE = "BILOU_entities"
 shapes, types = None, None
 
 
@@ -157,7 +153,6 @@ class EmbeddingIntentClassifier(EntityExtractor):
         "named_entity_recognition": True,
         "masked_lm_loss": False,
         "sparse_input_dropout": False,
-        "bilou_flag": False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -165,10 +160,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
     def _check_old_config_variables(config: Dict[Text, Any]) -> None:
         """Config migration warning"""
 
-        removed_tokenization_params = [
-            "label_tokenization_flag",
-            "label_split_symbol",
-        ]
+        removed_tokenization_params = ["label_tokenization_flag", "label_split_symbol"]
         for removed_param in removed_tokenization_params:
             if removed_param in config:
                 warnings.warn(
@@ -259,7 +251,6 @@ def _load_params(self) -> None:
         ]
         self.masked_lm_loss = self.component_config["masked_lm_loss"]
         self.sparse_input_dropout = self.component_config["sparse_input_dropout"]
-        self.bilou_flag = self.component_config["bilou_flag"]
 
     # package safety checks
     @classmethod
@@ -322,31 +313,9 @@ def _create_label_id_dict(
         }
 
     @staticmethod
-    def _create_tag_id_dict(
-        training_data: "TrainingData", bilou_flag: bool
-    ) -> Dict[Text, int]:
+    def _create_tag_id_dict(training_data: "TrainingData") -> Dict[Text, int]:
         """Create label_id dictionary"""
 
-        if bilou_flag:
-            bilou_prefix = ["B-", "I-", "L-", "U-"]
-            distinct_tag_ids = set(
-                [
-                    e[2:]
-                    for example in training_data.training_examples
-                    if example.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
-                    for e in example.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
-                ]
-            ) - {""}
-
-            tag_id_dict = {
-                f"{prefix}{tag_id}": idx_1 * len(bilou_prefix) + idx_2 + 1
-                for idx_1, tag_id in enumerate(sorted(distinct_tag_ids))
-                for idx_2, prefix in enumerate(bilou_prefix)
-            }
-            tag_id_dict["O"] = 0
-
-            return tag_id_dict
-
         distinct_tag_ids = set(
             [
                 e["entity"]
@@ -501,7 +470,7 @@ def _extract_labels_precomputed_features(
     #     labels_example: List["Message"],
     # ) -> List[np.ndarray]:
     #     """Compute one-hot representation for the labels"""
-    # 
+    #
     #     return [
     #         np.array(
     #             [
@@ -612,26 +581,10 @@ def _create_session_data(
                     label_ids.append(label_id_dict[e.get(label_attribute)])
 
             if self.named_entity_recognition and tag_id_dict:
-                if self.bilou_flag:
-                    if e.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE):
-                        _tags = [
-                            tag_id_dict[_tag]
-                            if _tag in tag_id_dict
-                            else tag_id_dict["O"]
-                            for _tag in e.get(MESSAGE_BILOU_ENTITIES_ATTRIBUTE)
-                        ]
-                    else:
-                        _tags = [
-                            tag_id_dict["O"]
-                            for _ in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
-                        ]
-                else:
-                    _tags = []
-                    for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
-                        _tag = determine_token_labels(
-                            t, e.get(ENTITIES_ATTRIBUTE), None
-                        )
-                        _tags.append(tag_id_dict[_tag])
+                _tags = []
+                for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
+                    _tag = determine_token_labels(t, e.get(ENTITIES_ATTRIBUTE), None)
+                    _tags.append(tag_id_dict[_tag])
                 # transpose to have seq_len x 1
                 tag_ids.append(np.array([_tags]).T)
 
@@ -664,31 +617,11 @@ def _create_session_data(
         return session_data
 
     # train helpers
-    def _apply_bilou_schema(self, training_data: "TrainingData"):
-        if not self.named_entity_recognition:
-            return
-
-        for example in training_data.training_examples:
-            entities = example.get(ENTITIES_ATTRIBUTE)
-
-            if not entities:
-                continue
-
-            entities = CRFEntityExtractor._convert_example(example)
-            output = CRFEntityExtractor._bilou_tags_from_offsets(
-                example.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), entities
-            )
-
-            example.set(MESSAGE_BILOU_ENTITIES_ATTRIBUTE, output)
-
     def preprocess_train_data(self, training_data: "TrainingData"):
         """Prepares data for training.
 
         Performs sanity checks on training data, extracts encodings for labels.
         """
-        if self.bilou_flag:
-            self._apply_bilou_schema(training_data)
-
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=INTENT_ATTRIBUTE
         )
@@ -698,7 +631,7 @@ def preprocess_train_data(self, training_data: "TrainingData"):
             training_data, label_id_dict, attribute=INTENT_ATTRIBUTE
         )
 
-        tag_id_dict = self._create_tag_id_dict(training_data, self.bilou_flag)
+        tag_id_dict = self._create_tag_id_dict(training_data)
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
         session_data = self._create_session_data(
@@ -816,9 +749,7 @@ def _predict(self, message: "Message"):
 
         return self.predict_func(batch_in)
 
-    def _predict_label(
-            self, out
-    ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
+    def _predict_label(self, out) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
         """Predicts the intent of the provided message."""
 
         label = {"name": None, "confidence": 0.0}
@@ -869,14 +800,9 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
         predictions = out["e_ids"].numpy()
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
-        # print(len(tags))
-        # print(len(message.get("tokens", [])))
-        # exit()
-        if self.bilou_flag:
-            tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
 
         entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags,predictions
+            message.text, message.get("tokens", []), tags, predictions
         )
 
         extracted = self.add_extractor_name(entities)
@@ -886,7 +812,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
 
     @staticmethod
     def _convert_tags_to_entities(
-        text: str, tokens: List[Token], tags: List[Text],predictions
+        text: str, tokens: List[Token], tags: List[Text], predictions
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = "O"
@@ -912,7 +838,7 @@ def _convert_tags_to_entities(
             last_tag = tag
 
         for entity in entities:
-            entity["value"] = text[entity["start"]: entity["end"]]
+            entity["value"] = text[entity["start"] : entity["end"]]
 
             if not entity["value"]:
                 print(text)
@@ -950,12 +876,6 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        # # plot training curves
-        # plotter = Plotter()
-        # plotter.plot_training_curves(self.training_log_file, model_dir)
-        # # copy trainig log file
-        # copyfile(self.training_log_file, os.path.join(model_dir, "training-log.tsv"))
-
         try:
             os.makedirs(os.path.dirname(tf_model_file))
         except OSError as e:
@@ -965,11 +885,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             if e.errno != errno.EEXIST:
                 raise
 
-        self.model.save_weights(tf_model_file, save_format='tf')
+        self.model.save_weights(tf_model_file, save_format="tf")
 
         dummy_session_data = {
-            k: [v[:1] for v in vs]
-            for k, vs in self.model.session_data.items()
+            k: [v[:1] for v in vs] for k, vs in self.model.session_data.items()
         }
 
         with open(
@@ -977,9 +896,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         ) as f:
             pickle.dump(dummy_session_data, f)
 
-        with open(
-            os.path.join(model_dir, file_name + ".label_data.pkl"), "wb"
-        ) as f:
+        with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
             pickle.dump(self._label_data, f)
 
         with open(
@@ -1087,22 +1004,13 @@ def load(
                 meta["batch_strategy"],
             )
             logger.debug("Loading the model ...")
-            model.fit(
-                1,
-                1,
-                0,
-                0,
-                silent=True,
-                eager=True,
-            )
+            model.fit(1, 1, 0, 0, silent=True, eager=True)
             model.load_weights(tf_model_file)
 
             # build the graph for prediction
             model.set_training_phase(False)
             model.session_data = {
-                k: vs
-                for k, vs in model.session_data.items()
-                if "text" in k
+                k: vs for k, vs in model.session_data.items() if "text" in k
             }
             model.build_for_predict()
             predict_dataset = model.predict_dataset()
@@ -1198,9 +1106,7 @@ def __init__(
         self.session_data = session_data
         self.eval_session_data = eval_session_data
         label_batch = train_utils.prepare_batch(label_data)
-        self.tf_label_data = train_utils.batch_to_session_data(
-            label_batch, label_data
-        )
+        self.tf_label_data = train_utils.batch_to_session_data(label_batch, label_data)
 
         # options
         self._sparse_input_dropout = sparse_input_dropout
@@ -1476,9 +1382,7 @@ def _entity_loss(
         return loss, f1
 
     def _train_losses_scores(self, batch_in):
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1506,9 +1410,7 @@ def _train_losses_scores(self, batch_in):
             cls = tf.gather_nd(text_transformed, idxs)
 
             label = self._create_bow(
-                tf_batch_data["label_features"],
-                tf_batch_data["label_mask"][0],
-                "label",
+                tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
             )
             loss, acc = self._intent_loss(cls, label)
             losses["i_loss"] = loss
@@ -1573,9 +1475,7 @@ def build_for_predict(self):
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(self, batch_in):
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1609,9 +1509,7 @@ def predict(self, batch_in):
             #     cls_embed[:, tf.newaxis, :], label_embed, None
             # )
 
-            scores = train_utils.confidence_from_sim(
-                sim_all, self._similarity_type
-            )
+            scores = train_utils.confidence_from_sim(sim_all, self._similarity_type)
             out["i_scores"] = scores
 
         if self._named_entity_recognition:
@@ -1624,7 +1522,5 @@ def predict(self, batch_in):
 
     def predict_dataset(self):
         return train_utils.create_tf_dataset(
-            self.session_data,
-            1,
-            label_key="label_ids",
+            self.session_data, 1, label_key="label_ids"
         )
diff --git a/rasa/utils/plotter.py b/rasa/utils/plotter.py
deleted file mode 100644
index 515ba7ec38c6..000000000000
--- a/rasa/utils/plotter.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from pathlib import Path
-from typing import Union, List, Text
-
-import numpy as np
-import csv
-
-
-# to enable %matplotlib inline if running in ipynb
-from IPython import get_ipython
-
-ipy = get_ipython()
-if ipy is not None:
-    ipy.run_line_magic("matplotlib", "inline")
-
-
-import matplotlib.pyplot as plt
-
-
-class Plotter(object):
-    """
-    Plots training parameters (loss, f-score, and accuracy) and training weights over time.
-    Input files are the output files 'loss.tsv' and 'weights.txt' from training either a sequence tagger or text
-    classification model.
-    """
-
-    @staticmethod
-    def _extract_evaluation_data(
-        file_name: Text, score: str = "loss", prefix: str = "i"
-    ) -> dict:
-        training_curves = {"train": [], "val": []}
-
-        with open(file_name, "r") as tsvin:
-            tsvin = csv.reader(tsvin, delimiter="\t")
-
-            # determine the column index of loss, f-score and accuracy for train, dev and test split
-            row = next(tsvin, None)
-
-            score = score.upper()
-
-            TRAIN_SCORE = (
-                row.index(f"{prefix.upper()}_{score.upper()}")
-                if f"{prefix.upper()}_{score.upper()}" in row
-                else None
-            )
-            VAL_SCORE = (
-                row.index(f"VAL_{prefix.upper()}_{score.upper()}")
-                if f"VAL_{prefix.upper()}_{score.upper()}" in row
-                else None
-            )
-
-            # then get all relevant values from the tsv
-            for row in tsvin:
-
-                if TRAIN_SCORE is not None:
-                    if row[TRAIN_SCORE] != "_":
-                        training_curves["train"].append(float(row[TRAIN_SCORE]))
-
-                if VAL_SCORE is not None:
-                    if VAL_SCORE < len(row) and row[VAL_SCORE] != "_":
-                        training_curves["val"].append(float(row[VAL_SCORE]))
-                    else:
-                        training_curves["val"].append(0.0)
-
-        return training_curves
-
-    def plot_training_curves(self, file_name: Union[Text], output_folder: Text):
-        if type(output_folder) is str:
-            output_folder = Path(output_folder)
-
-        metrics = {
-            "intent": {"scores": ["loss", "acc"], "prefix": "i"},
-            "entity": {"scores": ["loss", "f1"], "prefix": "e"},
-            "mask": {"scores": ["loss", "acc"], "prefix": "m"},
-        }
-
-        for metric_name, metric_values in metrics.items():
-
-            fig = plt.figure(figsize=(15, 10))
-
-            prefix = metric_values["prefix"]
-            scores = metric_values["scores"]
-
-            output_path = output_folder / f"training_{metric_name}.png"
-
-            for i, score in enumerate(scores):
-                training_curves = self._extract_evaluation_data(
-                    file_name, score, prefix
-                )
-
-                plt.subplot(len(scores), 1, i + 1)
-                if training_curves["train"]:
-                    x = np.arange(0, len(training_curves["train"]))
-                    plt.plot(
-                        x,
-                        training_curves["train"],
-                        label=f"train {metric_name} {score}",
-                    )
-                if training_curves["val"]:
-                    x = np.arange(0, len(training_curves["val"]))
-                    plt.plot(
-                        x, training_curves["val"], label=f"val {metric_name} {score}"
-                    )
-
-                plt.legend(bbox_to_anchor=(1.04, 0), loc="lower left", borderaxespad=0)
-                plt.ylabel(f"{metric_name} {score}")
-                plt.xlabel("epochs")
-
-            # save plots
-            plt.tight_layout(pad=1.0)
-            plt.savefig(output_path, dpi=300)
-            print(
-                f"Loss and acc plots are saved in {output_path}"
-            )  # to let user know the path of the save plots
-            plt.close(fig)

From 2f4c0461178949a0774610ecc26c5ff37c0da863 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 10:03:03 +0100
Subject: [PATCH 100/633] propagate config to DIET

---
 .../embedding_intent_classifier.py            | 525 ++++++------------
 requirements.txt                              |  10 +-
 2 files changed, 180 insertions(+), 355 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3f46f3285622..75ac30b1e980 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -7,11 +7,8 @@
 import typing
 import warnings
 
-from typing import Any, Dict, List, Optional, Text, Tuple, Union
+from typing import Any, Dict, List, Optional, Text, Tuple, Union, Callable
 
-from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-
-import rasa.utils.io as io_utils
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -78,13 +75,13 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_a": [],
+        "hidden_layers_sizes_text": [],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_b": [],
+        "hidden_layers_sizes_label": [],
         # sizes of hidden layers before the embedding layer for tag labels
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_c": [],
+        "hidden_layers_sizes_entities": [],
         # Whether to share the hidden layer weights between input words and labels
         "share_hidden_layers": False,
         # number of units in transformer
@@ -108,9 +105,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # set random seed to any int to get reproducible results
         "random_seed": None,
         # optimizer
-        "optimizer": "Adam",  # can be either 'Adam' (default) or 'Nadam'
         "learning_rate": 0.001,
-        "normalize_loss": False,
         # embedding parameters
         # default dense dimension used if no dense features are present
         "dense_dim": {"text": 512, "label": 20},
@@ -156,101 +151,28 @@ class EmbeddingIntentClassifier(EntityExtractor):
     }
     # end default properties (DOC MARKER - don't remove)
 
-    @staticmethod
-    def _check_old_config_variables(config: Dict[Text, Any]) -> None:
-        """Config migration warning"""
-
-        removed_tokenization_params = ["label_tokenization_flag", "label_split_symbol"]
-        for removed_param in removed_tokenization_params:
-            if removed_param in config:
-                warnings.warn(
-                    f"label tokenization has been moved to Tokenizer components. "
-                    f"Your config still mentions '{removed_param}'. "
-                    f"Tokenization may fail if you specify the parameter here. "
-                    f"Please specify the parameter 'intent_tokenization_flag' "
-                    f"and 'intent_split_symbol' in the "
-                    f"tokenizer of your NLU pipeline",
-                    FutureWarning,
-                )
-
     # init helpers
-    def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
-        self.hidden_layer_sizes = {
-            "text": config["hidden_layers_sizes_a"],
-            "label": config["hidden_layers_sizes_b"],
-            "tag": config["hidden_layers_sizes_c"],
-        }
-        self.share_hidden_layers = config["share_hidden_layers"]
+    def _check_config_parameters(self) -> None:
         if (
-            self.share_hidden_layers
-            and self.hidden_layer_sizes["text"] != self.hidden_layer_sizes["label"]
+            self.component_config["share_hidden_layers"]
+            and self.component_config["hidden_layers_sizes_text"]
+            != self.component_config["hidden_layers_sizes_label"]
         ):
             raise ValueError(
                 "If hidden layer weights are shared,"
-                "hidden_layer_sizes for a and b must coincide."
+                "hidden_layer_sizes for text and label must coincide."
             )
 
-        self.batch_size = config["batch_size"]
-        self.batch_strategy = config["batch_strategy"]
-
-        self.optimizer = config["optimizer"]
-        self.normalize_loss = config["normalize_loss"]
-        self.learning_rate = config["learning_rate"]
-        self.epochs = config["epochs"]
-
-        self.random_seed = self.component_config["random_seed"]
-
-        self.transformer_size = self.component_config["transformer_size"]
-        self.num_transformer_layers = self.component_config["num_transformer_layers"]
-        self.num_heads = self.component_config["num_heads"]
-        self.pos_encoding = self.component_config["pos_encoding"]
-        self.max_seq_length = self.component_config["max_seq_length"]
-        self.unidirectional_encoder = self.component_config["unidirectional_encoder"]
-
-    def _load_embedding_params(self, config: Dict[Text, Any]) -> None:
-        self.embed_dim = config["embed_dim"]
-        self.num_neg = config["num_neg"]
-        self.dense_dim = config["dense_dim"]
-
-        self.similarity_type = config["similarity_type"]
-        self.loss_type = config["loss_type"]
-        if self.similarity_type == "auto":
-            if self.loss_type == "softmax":
-                self.similarity_type = "inner"
-            elif self.loss_type == "margin":
-                self.similarity_type = "cosine"
-
-        self.mu_pos = config["mu_pos"]
-        self.mu_neg = config["mu_neg"]
-        self.use_max_sim_neg = config["use_max_sim_neg"]
-
-        self.scale_loss = config["scale_loss"]
-
-    def _load_regularization_params(self, config: Dict[Text, Any]) -> None:
-        self.C2 = config["C2"]
-        self.C_emb = config["C_emb"]
-        self.droprate = config["droprate"]
-
-    def _load_visual_params(self, config: Dict[Text, Any]) -> None:
-        self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"]
-        if self.evaluate_every_num_epochs < 1:
-            self.evaluate_every_num_epochs = self.epochs
-        self.evaluate_on_num_examples = config["evaluate_on_num_examples"]
-
-    def _load_params(self) -> None:
-        self._check_old_config_variables(self.component_config)
-        self._tf_config = train_utils.load_tf_config(self.component_config)
-        self._load_nn_architecture_params(self.component_config)
-        self._load_embedding_params(self.component_config)
-        self._load_regularization_params(self.component_config)
-        self._load_visual_params(self.component_config)
-
-        self.intent_classification = self.component_config["intent_classification"]
-        self.named_entity_recognition = self.component_config[
-            "named_entity_recognition"
-        ]
-        self.masked_lm_loss = self.component_config["masked_lm_loss"]
-        self.sparse_input_dropout = self.component_config["sparse_input_dropout"]
+        if self.component_config["similarity_type"] == "auto":
+            if self.component_config["loss_type"] == "softmax":
+                self.component_config["similarity_type"] = "inner"
+            elif self.component_config["loss_type"] == "margin":
+                self.component_config["similarity_type"] = "cosine"
+
+        if self.component_config["evaluate_every_num_epochs"] < 1:
+            self.component_config["evaluate_every_num_epochs"] = self.component_config[
+                "epochs"
+            ]
 
     # package safety checks
     @classmethod
@@ -262,8 +184,8 @@ def __init__(
         component_config: Optional[Dict[Text, Any]] = None,
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        model=None,
-        predict_func=None,
+        model: Optional[tf_models.RasaModel] = None,
+        predict_func: Optional[Callable] = None,
         batch_tuple_sizes: Optional[Dict] = None,
         attention_weights: Optional["tf.Tensor"] = None,
     ) -> None:
@@ -271,7 +193,7 @@ def __init__(
 
         super().__init__(component_config)
 
-        self._load_params()
+        self._check_config_parameters()
 
         # transform numbers to labels
         self.inverted_label_dict = inverted_label_dict
@@ -296,8 +218,6 @@ def __init__(
 
         self.attention_weights = attention_weights
 
-        self.training_log_file = io_utils.create_temporary_file("")
-
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -382,11 +302,6 @@ def _extract_and_add_features(
                     f"don't coincide in '{message.text}' for attribute '{attribute}'."
                 )
 
-        # if attribute != INTENT_ATTRIBUTE:
-        #     # Use only the CLS token vector as features
-        #     sparse_features = sequence_to_sentence_features(sparse_features)
-        #     dense_features = sequence_to_sentence_features(dense_features)
-
         return sparse_features, dense_features
 
     @staticmethod
@@ -432,7 +347,7 @@ def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int:
         return num_features
 
     def check_input_dimension_consistency(self, session_data: "SessionDataType"):
-        if self.share_hidden_layers:
+        if self.component_config["share_hidden_layers"]:
             num_text_features = self._get_num_of_features(session_data, "text_features")
             num_intent_features = self._get_num_of_features(
                 session_data, "label_features"
@@ -580,7 +495,7 @@ def _create_session_data(
                 if label_id_dict:
                     label_ids.append(label_id_dict[e.get(label_attribute)])
 
-            if self.named_entity_recognition and tag_id_dict:
+            if self.component_config["named_entity_recognition"] and tag_id_dict:
                 _tags = []
                 for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
                     _tag = determine_token_labels(t, e.get(ENTITIES_ATTRIBUTE), None)
@@ -666,7 +581,7 @@ def train(
 
         session_data = self.preprocess_train_data(training_data)
 
-        if self.intent_classification:
+        if self.component_config["intent_classification"]:
             possible_to_train = self._check_enough_labels(session_data)
 
             if not possible_to_train:
@@ -677,11 +592,11 @@ def train(
                 )
                 return
 
-        if self.evaluate_on_num_examples:
+        if self.component_config["evaluate_on_num_examples"]:
             session_data, eval_session_data = train_utils.train_val_split(
                 session_data,
-                self.evaluate_on_num_examples,
-                self.random_seed,
+                self.component_config["evaluate_on_num_examples"],
+                self.component_config["random_seed"],
                 label_key="label_ids",
             )
         else:
@@ -689,46 +604,21 @@ def train(
 
         # TODO set it in the model
         # set random seed
-        tf.random.set_seed(self.random_seed)
+        tf.random.set_seed(self.component_config["random_seed"])
 
         self.model = DIET(
             session_data,
             eval_session_data,
             self._label_data,
-            self.dense_dim,
-            self.embed_dim,
-            self.hidden_layer_sizes,
-            self.share_hidden_layers,
-            self.num_transformer_layers,
-            self.transformer_size,
-            self.num_heads,
-            self.max_seq_length,
-            self.unidirectional_encoder,
-            self.C2,
-            self.droprate,
-            self.sparse_input_dropout,
-            self.num_neg,
-            self.loss_type,
-            self.mu_pos,
-            self.mu_neg,
-            self.use_max_sim_neg,
-            self.C_emb,
-            self.scale_loss,
-            self.similarity_type,
-            self.masked_lm_loss,
-            self.intent_classification,
-            self.named_entity_recognition,
             self.inverted_tag_dict,
-            self.learning_rate,
-            self.batch_strategy,
+            self.component_config,
         )
 
         self.model.fit(
-            self.epochs,
-            self.batch_size,
-            self.evaluate_on_num_examples,
-            self.evaluate_every_num_epochs,
-            output_file=self.training_log_file,
+            self.component_config["epochs"],
+            self.component_config["batch_size"],
+            self.component_config["evaluate_on_num_examples"],
+            self.component_config["evaluate_every_num_epochs"],
         )
 
         # rebuild the graph for prediction
@@ -854,13 +744,13 @@ def process(self, message: "Message", **kwargs: Any) -> None:
 
         out = self._predict(message)
 
-        if self.intent_classification:
+        if self.component_config["intent_classification"]:
             label, label_ranking = self._predict_label(out)
 
             message.set("label", label, add_to_output=True)
             message.set("label_ranking", label_ranking, add_to_output=True)
 
-        if self.named_entity_recognition:
+        if self.component_config["named_entity_recognition"]:
             entities = self._predict_entities(out, message)
 
             message.set("entities", entities, add_to_output=True)
@@ -928,115 +818,75 @@ def load(
     ) -> "EmbeddingIntentClassifier":
         """Loads the trained model from the provided directory."""
 
-        if model_dir and meta.get("file"):
-            file_name = meta.get("file")
-            tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
-
-            with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
-                _tf_config = pickle.load(f)
-
-            with open(
-                os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "rb"
-            ) as f:
-                dummy_session_data = pickle.load(f)
-
-            with open(
-                os.path.join(model_dir, file_name + ".label_data.pkl"), "rb"
-            ) as f:
-                label_data = pickle.load(f)
-
-            with open(
-                os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb"
-            ) as f:
-                inv_label_dict = pickle.load(f)
-
-            with open(
-                os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "rb"
-            ) as f:
-                inv_tag_dict = pickle.load(f)
-
-            with open(
-                os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "rb"
-            ) as f:
-                batch_tuple_sizes = pickle.load(f)
-
-            hidden_layer_sizes = {
-                "text": meta["hidden_layers_sizes_a"],
-                "label": meta["hidden_layers_sizes_b"],
-                "tag": meta["hidden_layers_sizes_c"],
-            }
-            similarity_type = meta["similarity_type"]
-            if similarity_type == "auto":
-                if meta["loss_type"] == "softmax":
-                    similarity_type = "inner"
-                elif meta["loss_type"] == "margin":
-                    similarity_type = "cosine"
-
-            model = DIET(
-                dummy_session_data,
-                None,
-                label_data,
-                meta["dense_dim"],
-                meta["embed_dim"],
-                hidden_layer_sizes,
-                meta["share_hidden_layers"],
-                meta["num_transformer_layers"],
-                meta["transformer_size"],
-                meta["num_heads"],
-                meta["max_seq_length"],
-                meta["unidirectional_encoder"],
-                meta["C2"],
-                meta["droprate"],
-                meta["sparse_input_dropout"],
-                meta["num_neg"],
-                meta["loss_type"],
-                meta["mu_pos"],
-                meta["mu_neg"],
-                meta["use_max_sim_neg"],
-                meta["C_emb"],
-                meta["scale_loss"],
-                similarity_type,
-                meta["masked_lm_loss"],
-                meta["intent_classification"],
-                meta["named_entity_recognition"],
-                inv_tag_dict,
-                meta["learning_rate"],
-                meta["batch_strategy"],
-            )
-            logger.debug("Loading the model ...")
-            model.fit(1, 1, 0, 0, silent=True, eager=True)
-            model.load_weights(tf_model_file)
-
-            # build the graph for prediction
-            model.set_training_phase(False)
-            model.session_data = {
-                k: vs for k, vs in model.session_data.items() if "text" in k
-            }
-            model.build_for_predict()
-            predict_dataset = model.predict_dataset()
-            predict_func = tf.function(
-                model.predict, input_signature=[predict_dataset.element_spec]
-            )
-            batch_in = next(iter(predict_dataset))
-            predict_func(batch_in)
-            logger.debug("Finished loading the model.")
-
-            return cls(
-                component_config=meta,
-                inverted_label_dict=inv_label_dict,
-                inverted_tag_dict=inv_tag_dict,
-                model=model,
-                predict_func=predict_func,
-                batch_tuple_sizes=batch_tuple_sizes,
-            )
-
-        else:
+        if not model_dir or not meta.get("file"):
             warnings.warn(
                 f"Failed to load nlu model. "
                 f"Maybe path '{os.path.abspath(model_dir)}' doesn't exist."
             )
             return cls(component_config=meta)
 
+        file_name = meta.get("file")
+        tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
+
+        with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
+            _tf_config = pickle.load(f)
+
+        with open(
+            os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "rb"
+        ) as f:
+            dummy_session_data = pickle.load(f)
+
+        with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
+            label_data = pickle.load(f)
+
+        with open(
+            os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb"
+        ) as f:
+            inv_label_dict = pickle.load(f)
+
+        with open(os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "rb") as f:
+            inv_tag_dict = pickle.load(f)
+
+        with open(
+            os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "rb"
+        ) as f:
+            batch_tuple_sizes = pickle.load(f)
+
+        if meta["similarity_type"] == "auto":
+            if meta["loss_type"] == "softmax":
+                meta["similarity_type"] = "inner"
+            elif meta["loss_type"] == "margin":
+                meta["similarity_type"] = "cosine"
+
+        model = DIET(dummy_session_data, None, label_data, inv_tag_dict, meta)
+
+        logger.debug("Loading the model ...")
+        model.fit(1, 1, 0, 0, silent=True, eager=True)
+        model.load_weights(tf_model_file)
+
+        # build the graph for prediction
+        model.set_training_phase(False)
+        model.session_data = {
+            k: vs for k, vs in model.session_data.items() if "text" in k
+        }
+        model.build_for_predict()
+        predict_dataset = model.predict_dataset()
+        predict_func = tf.function(
+            model.predict, input_signature=[predict_dataset.element_spec]
+        )
+        batch_in = next(iter(predict_dataset))
+        predict_func(batch_in)
+        logger.debug("Finished loading the model.")
+
+        return cls(
+            component_config=meta,
+            inverted_label_dict=inv_label_dict,
+            inverted_tag_dict=inv_tag_dict,
+            model=model,
+            predict_func=predict_func,
+            batch_tuple_sizes=batch_tuple_sizes,
+        )
+
 
 class DIET(tf_models.RasaModel):
     @staticmethod
@@ -1070,35 +920,11 @@ def _input_dim(values, dense_dim):
 
     def __init__(
         self,
-        session_data,
-        eval_session_data,
-        label_data,
-        dense_dim,
-        embed_dim,
-        hidden_layer_sizes,
-        share_hidden_layers,
-        num_transformer_layers,
-        transformer_size,
-        num_heads,
-        max_seq_length,
-        unidirectional_encoder,
-        reg_lambda,
-        droprate,
-        sparse_input_dropout,
-        num_neg,
-        loss_type,
-        mu_pos,
-        mu_neg,
-        use_max_sim_neg,
-        C_emb,
-        scale_loss,
-        similarity_type,
-        masked_lm_loss,
-        intent_classification,
-        named_entity_recognition,
-        inverted_tag_dict,
-        learning_rate,
-        batch_in_strategy,
+        session_data: SessionDataType,
+        eval_session_data: Optional[SessionDataType],
+        label_data: SessionDataType,
+        inverted_tag_dict: Dict[int, Text],
+        config: Dict[Text, Any],
     ):
         super(DIET, self).__init__(name="DIET")
 
@@ -1108,119 +934,126 @@ def __init__(
         label_batch = train_utils.prepare_batch(label_data)
         self.tf_label_data = train_utils.batch_to_session_data(label_batch, label_data)
 
-        # options
-        self._sparse_input_dropout = sparse_input_dropout
-        self._num_neg = num_neg
-        self._loss_type = loss_type
-        self._mu_pos = mu_pos
-        self._mu_neg = mu_neg
-        self._use_max_sim_neg = use_max_sim_neg
-        self._C_emb = C_emb
-        self._scale_loss = scale_loss
-        self._similarity_type = similarity_type
-        self._masked_lm_loss = masked_lm_loss
-        self._intent_classification = intent_classification
-        self._named_entity_recognition = named_entity_recognition
-        self._inverted_tag_dict = inverted_tag_dict
-        self._num_tags = len(inverted_tag_dict)
-        self._batch_in_strategy = batch_in_strategy
+        self.config = config
 
         # tf objects
-        self._sparse_dropout = tf_layers.SparseDropout(rate=droprate)
+        self._prepare_layers(config, session_data)
+
+        # tf tensors
+        self.training = tf.ones((), tf.bool)
+
+        # tf training
+        self._optimizer = tf.keras.optimizers.Adam(config["learning_rate"])
+        self.entity_f1 = tfa.metrics.F1Score(
+            num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+            average="micro",
+        )
+
+        # persist
+        self.all_labels_embed = None
+        self.batch_tuple_sizes = None
+
+    def _prepare_layers(self, session_data):
+        self._sparse_dropout = tf_layers.SparseDropout(rate=self.config["droprate"])
+
         self._sparse_to_dense = {
             "text": self._create_sparse_dense_layer(
-                session_data["text_features"], "text", reg_lambda, dense_dim["text"]
+                session_data["text_features"],
+                "text",
+                self.config["C2"],
+                self.config["dense_dim"]["text"],
             ),
             "label": self._create_sparse_dense_layer(
-                session_data["label_features"], "label", reg_lambda, dense_dim["label"]
+                session_data["label_features"],
+                "label",
+                self.config["C2"],
+                self.config["dense_dim"]["label"],
             ),
         }
 
         self._ffnn = {
             "text": tf_layers.ReluFfn(
-                hidden_layer_sizes["text"],
-                droprate,
-                reg_lambda,
-                "text_intent" if share_hidden_layers else "text",
+                self.config["hidden_layers_sizes_text"],
+                self.config["droprate"],
+                self.config["C2"],
+                "text_intent" if self.config["share_hidden_layers"] else "text",
             ),
             "label": tf_layers.ReluFfn(
-                hidden_layer_sizes["label"],
-                droprate,
-                reg_lambda,
-                "text_intent" if share_hidden_layers else "label",
+                self.config["hidden_layers_sizes_label"],
+                self.config["droprate"],
+                self.config["C2"],
+                "text_intent" if self.config["share_hidden_layers"] else "label",
             ),
         }
 
-        if num_transformer_layers > 0:
+        if self.config["num_transformer_layers"] > 0:
             self._transformer = tf_layers.TransformerEncoder(
-                num_transformer_layers,
-                transformer_size,
-                num_heads,
-                transformer_size * 4,
-                max_seq_length,
-                reg_lambda,
-                droprate,
-                unidirectional_encoder,
+                self.config["num_transformer_layers"],
+                self.config["transformer_size"],
+                self.config["num_heads"],
+                self.config["transformer_size"] * 4,
+                self.config["max_seq_length"],
+                self.config["C2"],
+                self.config["droprate"],
+                self.config["unidirectional_encoder"],
                 name="text_encoder",
             )
         else:
             self._transformer = lambda x, mask, training: x
 
         self._embed = {}
+
         self.train_metrics = {"t_loss": tf.keras.metrics.Mean(name="t_loss")}
         self.eval_metrics = {"val_t_loss": tf.keras.metrics.Mean(name="val_t_loss")}
-        if self._masked_lm_loss:
+
+        self._input_mask = None
+        if self.config["masked_lm_loss"]:
             self._input_mask = tf_layers.InputMask()
             self._embed["text_mask"] = tf_layers.Embed(
-                embed_dim, reg_lambda, "text_mask", similarity_type
+                self.config["embed_dim"],
+                self.config["C2"],
+                "text_mask",
+                self.config["similarity_type"],
             )
             self._embed["text_token"] = tf_layers.Embed(
-                embed_dim, reg_lambda, "text_token", similarity_type
+                self.config["embed_dim"],
+                self.config["C2"],
+                "text_token",
+                self.config["similarity_type"],
             )
             self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
             self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
             self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
             self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
-        else:
-            self._input_mask = None
 
-        if self._intent_classification:
+        if self.config["intent_classification"]:
             self._embed["text"] = tf_layers.Embed(
-                embed_dim, reg_lambda, "text", similarity_type
+                self.config["embed_dim"],
+                self.config["C2"],
+                "text",
+                self.config["similarity_type"],
             )
             self._embed["label"] = tf_layers.Embed(
-                embed_dim, reg_lambda, "label", similarity_type
+                self.config["embed_dim"],
+                self.config["C2"],
+                "label",
+                self.config["similarity_type"],
             )
             self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
             self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
             self.eval_metrics["val_i_loss"] = tf.keras.metrics.Mean(name="val_i_loss")
             self.eval_metrics["val_i_acc"] = tf.keras.metrics.Mean(name="val_i_acc")
 
-        if self._named_entity_recognition:
+        self._crf = None
+        if self.config["named_entity_recognition"]:
             self._embed["logits"] = tf_layers.Embed(
-                self._num_tags, reg_lambda, "logits"
+                self._num_tags, self.config["C2"], "logits"
             )
-            self._crf = tf_layers.CRF(self._num_tags, reg_lambda)
+            self._crf = tf_layers.CRF(self._num_tags, self.config["C2"])
             self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
             self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
             self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
             self.eval_metrics["val_e_f1"] = tf.keras.metrics.Mean(name="val_e_f1")
-        else:
-            self._crf = None
-
-        # tf tensors
-        self.training = tf.ones((), tf.bool)
-
-        # tf training
-        self._optimizer = tf.keras.optimizers.Adam(learning_rate)
-        self.entity_f1 = tfa.metrics.F1Score(
-            num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-            average="micro",
-        )
-
-        # persist
-        self.all_labels_embed = None
-        self.batch_tuple_sizes = None
 
     def set_training_phase(self, training: bool):
         if training:
@@ -1394,14 +1227,14 @@ def _train_losses_scores(self, batch_in):
         losses = {}
         scores = {}
 
-        if self._masked_lm_loss:
+        if self.config["masked_lm_loss"]:
             loss, acc = self._mask_loss(
                 text_transformed, text_in, lm_mask_bool_text, "text"
             )
             losses["m_loss"] = loss
             scores["m_acc"] = acc
 
-        if self._intent_classification:
+        if self.config["intent_classification"]:
             # get _cls_ vector for intent classification
             last_index = tf.maximum(
                 tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
@@ -1416,7 +1249,7 @@ def _train_losses_scores(self, batch_in):
             losses["i_loss"] = loss
             scores["i_acc"] = acc
 
-        if self._named_entity_recognition:
+        if self.config["named_entity_recognition"]:
             tags = tf_batch_data["tag_ids"][0]
 
             loss, f1 = self._entity_loss(
@@ -1485,7 +1318,7 @@ def predict(self, batch_in):
         )
 
         out = {}
-        if self._intent_classification:
+        if self.config["intent_classification"]:
             # get _cls_ vector for intent classification
             last_index = tf.maximum(
                 tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
@@ -1512,7 +1345,7 @@ def predict(self, batch_in):
             scores = train_utils.confidence_from_sim(sim_all, self._similarity_type)
             out["i_scores"] = scores
 
-        if self._named_entity_recognition:
+        if self.config["named_entity_recognition"]:
             sequence_lengths = sequence_lengths - 1
             logits = self._embed["logits"](text_transformed)
             pred_ids = self._crf(logits, sequence_lengths)
diff --git a/requirements.txt b/requirements.txt
index 4c1b9ae24cf8..271924da2522 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,16 +62,8 @@ python-dateutil==2.8.0
 # https://github.com/tensorflow/tensorflow/issues/32319
 gast==0.2.2
 # for new featurizers
-tensorflow==1.14.0
+tensorflow==2.1.0
 tensorflow_hub==0.6.0
 #tensorflow_text[no-deps]==0.1.0
-torch
-torchvision
-transformers
-# for hermit evaluation
-pandas
-progress
-# for plotter
-ipython
 # to calculate f1 score in new architecture
 git+https://github.com/guillaumegenthial/tf_metrics.git

From b5a1009da3b9188c14f050ffa1db11e3b91b6519 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 10:14:35 +0100
Subject: [PATCH 101/633] exchange missing config values

---
 .../embedding_intent_classifier.py            | 53 ++++++++++---------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 75ac30b1e980..5184efd41aa0 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -79,9 +79,6 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
         "hidden_layers_sizes_label": [],
-        # sizes of hidden layers before the embedding layer for tag labels
-        # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_entities": [],
         # Whether to share the hidden layer weights between input words and labels
         "share_hidden_layers": False,
         # number of units in transformer
@@ -218,6 +215,8 @@ def __init__(
 
         self.attention_weights = attention_weights
 
+        self._tf_config = train_utils.load_tf_config(self.component_config)
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -577,7 +576,7 @@ def train(
         logger.debug("Started training embedding classifier.")
 
         # set numpy random seed
-        np.random.seed(self.random_seed)
+        np.random.seed(self.component_config["random_seed"])
 
         session_data = self.preprocess_train_data(training_data)
 
@@ -933,11 +932,12 @@ def __init__(
         self.eval_session_data = eval_session_data
         label_batch = train_utils.prepare_batch(label_data)
         self.tf_label_data = train_utils.batch_to_session_data(label_batch, label_data)
+        self._num_tags = len(inverted_tag_dict)
 
         self.config = config
 
         # tf objects
-        self._prepare_layers(config, session_data)
+        self._prepare_layers(session_data)
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -953,7 +953,7 @@ def __init__(
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _prepare_layers(self, session_data):
+    def _prepare_layers(self, session_data: SessionDataType):
         self._sparse_dropout = tf_layers.SparseDropout(rate=self.config["droprate"])
 
         self._sparse_to_dense = {
@@ -1102,7 +1102,7 @@ def _create_sequence(
         masked_lm_loss: bool = False,
     ):
         x = self._combine_sparse_dense_features(
-            features, mask, name, sparse_dropout=self._sparse_input_dropout
+            features, mask, name, sparse_dropout=self.config["sparse_input_dropout"]
         )
 
         if masked_lm_loss:
@@ -1138,14 +1138,14 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
             a_masked,
             a_embed,
             a,
-            self._num_neg,
+            self.config["num_neg"],
             None,
-            self._loss_type,
-            self._mu_pos,
-            self._mu_neg,
-            self._use_max_sim_neg,
-            self._C_emb,
-            self._scale_loss,
+            self.config["loss_type"],
+            self.config["mu_pos"],
+            self.config["mu_neg"],
+            self.config["use_max_sim_neg"],
+            self.config["C_emb"],
+            self.config["scale_loss"],
         )
 
     def _build_all_b(self):
@@ -1170,14 +1170,14 @@ def _intent_loss(self, a, b):
             b,
             all_labels_embed,
             all_labels,
-            self._num_neg,
+            self.config["num_neg"],
             None,
-            self._loss_type,
-            self._mu_pos,
-            self._mu_neg,
-            self._use_max_sim_neg,
-            self._C_emb,
-            self._scale_loss,
+            self.config["loss_type"],
+            self.config["mu_pos"],
+            self.config["mu_neg"],
+            self.config["use_max_sim_neg"],
+            self.config["C_emb"],
+            self.config["scale_loss"],
         )
 
     def _entity_loss(
@@ -1221,7 +1221,10 @@ def _train_losses_scores(self, batch_in):
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
 
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, "text", self._masked_lm_loss
+            tf_batch_data["text_features"],
+            mask_text,
+            "text",
+            self.config["masked_lm_loss"],
         )
 
         losses = {}
@@ -1281,7 +1284,7 @@ def train_dataset(self, batch_size):
             self.session_data,
             batch_size,
             label_key="label_ids",
-            batch_strategy=self._batch_in_strategy,
+            batch_strategy=self.config["batch_strategy"],
             shuffle=True,
         )
 
@@ -1342,7 +1345,9 @@ def predict(self, batch_in):
             #     cls_embed[:, tf.newaxis, :], label_embed, None
             # )
 
-            scores = train_utils.confidence_from_sim(sim_all, self._similarity_type)
+            scores = train_utils.confidence_from_sim(
+                sim_all, self.config["similarity_type"]
+            )
             out["i_scores"] = scores
 
         if self.config["named_entity_recognition"]:

From 92d50d7a28e08ab3661322210f7b5a4295e79438 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 11:16:14 +0100
Subject: [PATCH 102/633] create dot product loss layer

---
 .../embedding_intent_classifier.py            |  51 ++-
 rasa/utils/tf_layers.py                       | 352 +++++++++++++++++
 rasa/utils/tf_models.py                       |   7 +-
 rasa/utils/train_utils.py                     | 354 ------------------
 4 files changed, 377 insertions(+), 387 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 08c3dbba59bd..fb59ef01b194 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -869,9 +869,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
         predictions = out["e_ids"].numpy()
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
-        # print(len(tags))
-        # print(len(message.get("tokens", [])))
-        # exit()
+
         if self.bilou_flag:
             tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
 
@@ -914,13 +912,6 @@ def _convert_tags_to_entities(
         for entity in entities:
             entity["value"] = text[entity["start"]: entity["end"]]
 
-            if not entity["value"]:
-                print(text)
-                print([t.text for t in tokens])
-                print(tags)
-                print(predictions)
-                exit()
-
         return entities
 
     def process(self, message: "Message", **kwargs: Any) -> None:
@@ -1271,6 +1262,15 @@ def __init__(
             self._embed["text_token"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "text_token", similarity_type
             )
+            self._loss_mask = tf_layers.DotProductLoss(
+                self._num_neg,
+                self._loss_type,
+                self._mu_pos,
+                self._mu_neg,
+                self._use_max_sim_neg,
+                self._C_emb,
+                self._scale_loss,
+            )
             self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
             self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
             self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
@@ -1285,6 +1285,15 @@ def __init__(
             self._embed["label"] = tf_layers.Embed(
                 embed_dim, reg_lambda, "label", similarity_type
             )
+            self._loss_label = tf_layers.DotProductLoss(
+                self._num_neg,
+                self._loss_type,
+                self._mu_pos,
+                self._mu_neg,
+                self._use_max_sim_neg,
+                self._C_emb,
+                self._scale_loss,
+            )
             self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
             self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
             self.eval_metrics["val_i_loss"] = tf.keras.metrics.Mean(name="val_i_loss")
@@ -1393,20 +1402,12 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
 
         a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
 
-        return train_utils.calculate_loss_acc(
+        return self._loss_mask(
             a_t_masked_embed,
             a_embed_masked,
             a_masked,
             a_embed,
             a,
-            self._num_neg,
-            None,
-            self._loss_type,
-            self._mu_pos,
-            self._mu_neg,
-            self._use_max_sim_neg,
-            self._C_emb,
-            self._scale_loss,
         )
 
     def _build_all_b(self):
@@ -1425,20 +1426,12 @@ def _intent_loss(self, a, b):
         a_embed = self._embed["text"](a)
         b_embed = self._embed["label"](b)
 
-        return train_utils.calculate_loss_acc(
+        return self._loss_label(
             a_embed,
             b_embed,
             b,
             all_labels_embed,
             all_labels,
-            self._num_neg,
-            None,
-            self._loss_type,
-            self._mu_pos,
-            self._mu_neg,
-            self._use_max_sim_neg,
-            self._C_emb,
-            self._scale_loss,
         )
 
     def _entity_loss(
@@ -1594,7 +1587,7 @@ def predict(self, batch_in):
             cls = tf.gather_nd(text_transformed, idxs)
             cls_embed = self._embed["text"](cls)
 
-            sim_all = train_utils.tf_raw_sim(
+            sim_all = self._loss_label.sim(
                 cls_embed[:, tf.newaxis, :],
                 self.all_labels_embed[tf.newaxis, :, :],
                 None,
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 4f05049b0a03..bf601b3edb7e 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -452,3 +452,355 @@ def loss(self, logits, tag_indices, sequence_lengths):
             logits, tag_indices, sequence_lengths, self.transition_params
         )
         return tf.reduce_mean(-log_likelihood)
+
+
+class DotProductLoss(tf.keras.layers.Layer):
+
+    def __init__(
+        self,
+        num_neg: int,
+        loss_type: Text,
+        mu_pos: float,
+        mu_neg: float,
+        use_max_sim_neg: bool,
+        neg_lambda: float,
+        scale_loss: bool,
+        name=None
+    ):
+        super().__init__(name=name)
+        self.num_neg = num_neg
+        self.loss_type = loss_type
+        self.mu_pos = mu_pos
+        self.mu_neg = mu_neg
+        self.use_max_sim_neg = use_max_sim_neg
+        self.neg_lambda = neg_lambda
+        self.scale_loss = scale_loss
+
+    @staticmethod
+    def _make_flat(x: "tf.Tensor") -> "tf.Tensor":
+        """Make tensor 2D."""
+
+        return tf.reshape(x, (-1, x.shape[-1]))
+
+    def _random_indices(self, batch_size: "tf.Tensor", total_candidates: "tf.Tensor"):
+
+        # all_indices = tf.tile(
+        #     tf.expand_dims(tf.range(total_candidates), 0),
+        #     (batch_size, 1),
+        # )
+        # shuffled_indices = tf.transpose(
+        #     tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)
+        # )
+        # return shuffled_indices[:, :self.num_neg]
+
+        def rand_idxs():
+            """Create random tensor of indices"""
+            # (1, num_neg)
+            return tf.expand_dims(
+                tf.random.shuffle(tf.range(total_candidates))[:self.num_neg], 0)
+
+        def cond(i, out):
+            """Condition for while loop"""
+            return i < batch_size
+
+        def body(i, out):
+            """Body of the while loop"""
+            return [
+                # increment counter
+                i + 1,
+                # add random indices
+                tf.concat([out, rand_idxs()], 0)
+            ]
+
+        # first tensor already created
+        i1 = tf.constant(1)
+        # create first random array of indices
+        out1 = rand_idxs()  # (1, num_neg)
+
+        return tf.while_loop(
+            cond,
+            body,
+            loop_vars=[i1, out1],
+            shape_invariants=[i1.shape, tf.TensorShape([None, self.num_neg])],
+            back_prop=False,
+        )[1]
+
+    @staticmethod
+    def _sample_idxs(
+        batch_size: "tf.Tensor", x: "tf.Tensor", idxs: "tf.Tensor"
+    ) -> "tf.Tensor":
+        """Sample negative examples for given indices"""
+
+        tiled = tf.tile(tf.expand_dims(x, 0), (batch_size, 1, 1))
+
+        return tf.gather(tiled, idxs, batch_dims=-1)
+
+    def _get_bad_mask(
+        self, labels: "tf.Tensor", target_labels: "tf.Tensor", idxs: "tf.Tensor"
+    ) -> "tf.Tensor":
+        """Calculate bad mask for given indices.
+
+        Checks that input features are different for positive negative samples.
+        """
+
+        pos_labels = tf.expand_dims(target_labels, -2)
+        neg_labels = self._sample_idxs(tf.shape(target_labels)[0], labels, idxs)
+
+        return tf.cast(
+            tf.reduce_all(tf.equal(neg_labels, pos_labels), axis=-1), pos_labels.dtype
+        )
+
+    def _get_negs(
+        self, embeds: "tf.Tensor", labels: "tf.Tensor", target_labels: "tf.Tensor"
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        """Get negative examples from given tensor."""
+
+        embeds_flat = self._make_flat(embeds)
+        labels_flat = self._make_flat(labels)
+        target_labels_flat = self._make_flat(target_labels)
+
+        total_candidates = tf.shape(embeds_flat)[0]
+        target_size = tf.shape(target_labels_flat)[0]
+
+        neg_ids = self._random_indices(target_size, total_candidates)
+
+        neg_embeds = self._sample_idxs(target_size, embeds_flat, neg_ids)
+        bad_negs = self._get_bad_mask(labels_flat, target_labels_flat, neg_ids)
+
+        if len(target_labels.shape) == 3:
+            target_shape = tf.shape(target_labels)
+            neg_embeds = tf.reshape(
+                neg_embeds, (target_shape[0], target_shape[1], -1, embeds.shape[-1])
+            )
+            bad_negs = tf.reshape(bad_negs, (target_shape[0], target_shape[1], -1))
+
+        return neg_embeds, bad_negs
+
+    def _sample_negatives(
+        self,
+        inputs_embed: "tf.Tensor",
+        labels_embed: "tf.Tensor",
+        labels: "tf.Tensor",
+        all_labels_embed: "tf.Tensor",
+        all_labels: "tf.Tensor",
+    ) -> Tuple[
+        "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"
+    ]:
+        """Sample negative examples."""
+
+        pos_inputs_embed = tf.expand_dims(inputs_embed, -2)
+        pos_labels_embed = tf.expand_dims(labels_embed, -2)
+
+        # sample negative inputs
+        neg_inputs_embed, inputs_bad_negs = self._get_negs(inputs_embed, labels, labels)
+        # sample negative labels
+        neg_labels_embed, labels_bad_negs = self._get_negs(
+            all_labels_embed, all_labels, labels
+        )
+        return (
+            pos_inputs_embed,
+            pos_labels_embed,
+            neg_inputs_embed,
+            neg_labels_embed,
+            inputs_bad_negs,
+            labels_bad_negs,
+        )
+
+    @staticmethod
+    def sim(
+        a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]
+    ) -> "tf.Tensor":
+        """Calculate similarity between given tensors."""
+
+        sim = tf.reduce_sum(a * b, -1)
+        if mask is not None:
+            sim *= tf.expand_dims(mask, 2)
+
+        return sim
+
+    def _train_sim(
+        self,
+        pos_inputs_embed: "tf.Tensor",
+        pos_labels_embed: "tf.Tensor",
+        neg_inputs_embed: "tf.Tensor",
+        neg_labels_embed: "tf.Tensor",
+        inputs_bad_negs: "tf.Tensor",
+        labels_bad_negs: "tf.Tensor",
+        mask: Optional["tf.Tensor"],
+    ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]:
+        """Define similarity."""
+
+        # calculate similarity with several
+        # embedded actions for the loss
+        neg_inf = tf.constant(-1e9)
+
+        sim_pos = self.sim(pos_inputs_embed, pos_labels_embed, mask)
+        sim_neg_il = self.sim(pos_inputs_embed, neg_labels_embed, mask) + neg_inf * labels_bad_negs
+        sim_neg_ll = (
+                self.sim(pos_labels_embed, neg_labels_embed, mask) + neg_inf * labels_bad_negs
+        )
+        sim_neg_ii = (
+                self.sim(pos_inputs_embed, neg_inputs_embed, mask) + neg_inf * inputs_bad_negs
+        )
+        sim_neg_li = (
+                self.sim(pos_labels_embed, neg_inputs_embed, mask) + neg_inf * inputs_bad_negs
+        )
+
+        # output similarities between user input and bot actions
+        # and similarities between bot actions and similarities between user inputs
+        return sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li
+
+    @staticmethod
+    def _calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
+        """Calculate accuracy"""
+
+        max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
+        return tf.reduce_mean(
+            tf.cast(tf.math.equal(max_all_sim, tf.squeeze(sim_pos, -1)), tf.float32)
+        )
+
+    def _loss_margin(
+        self,
+        sim_pos: "tf.Tensor",
+        sim_neg_il: "tf.Tensor",
+        sim_neg_ll: "tf.Tensor",
+        sim_neg_ii: "tf.Tensor",
+        sim_neg_li: "tf.Tensor",
+        mask: Optional["tf.Tensor"],
+    ) -> "tf.Tensor":
+        """Define max margin loss."""
+
+        # loss for maximizing similarity with correct action
+        loss = tf.maximum(0.0, self.mu_pos - tf.squeeze(sim_pos, -1))
+
+        # loss for minimizing similarity with `num_neg` incorrect actions
+        if self.use_max_sim_neg:
+            # minimize only maximum similarity over incorrect actions
+            max_sim_neg_il = tf.reduce_max(sim_neg_il, -1)
+            loss += tf.maximum(0.0, self.mu_neg + max_sim_neg_il)
+        else:
+            # minimize all similarities with incorrect actions
+            max_margin = tf.maximum(0.0, self.mu_neg + sim_neg_il)
+            loss += tf.reduce_sum(max_margin, -1)
+
+        # penalize max similarity between pos bot and neg bot embeddings
+        max_sim_neg_ll = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_ll, -1))
+        loss += max_sim_neg_ll * self.neg_lambda
+
+        # penalize max similarity between pos dial and neg dial embeddings
+        max_sim_neg_ii = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_ii, -1))
+        loss += max_sim_neg_ii * self.neg_lambda
+
+        # penalize max similarity between pos bot and neg dial embeddings
+        max_sim_neg_li = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_li, -1))
+        loss += max_sim_neg_li * self.neg_lambda
+
+        if mask is not None:
+            # mask loss for different length sequences
+            loss *= mask
+            # average the loss over sequence length
+            loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1)
+
+        # average the loss over the batch
+        loss = tf.reduce_mean(loss)
+
+        return loss
+
+    def _loss_softmax(
+        self,
+        sim_pos: "tf.Tensor",
+        sim_neg_il: "tf.Tensor",
+        sim_neg_ll: "tf.Tensor",
+        sim_neg_ii: "tf.Tensor",
+        sim_neg_li: "tf.Tensor",
+        mask: Optional["tf.Tensor"],
+    ) -> "tf.Tensor":
+        """Define softmax loss."""
+
+        logits = tf.concat(
+            [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], -1
+        )
+
+        # create label_ids for softmax
+        label_ids = tf.zeros_like(logits[..., 0], tf.int32)
+
+        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            labels=label_ids, logits=logits
+        )
+
+        if mask is None:
+            mask = 1.0
+
+        if self.scale_loss:
+            # mask loss by prediction confidence
+            pos_pred = tf.stop_gradient(tf.nn.softmax(logits)[..., 0])
+            scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
+            # scale loss
+            loss *= scale_mask
+
+        if len(loss.shape) == 2:
+            # average over the sequence
+            loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, -1)
+
+        # average the loss over all examples
+        loss = tf.reduce_mean(loss)
+
+        return loss
+
+    @property
+    def _chosen_loss(self) -> Callable:
+        """Use loss depending on given option."""
+
+        if self.loss_type == "margin":
+            return self._loss_margin
+        elif self.loss_type == "softmax":
+            return self._loss_softmax
+        else:
+            raise ValueError(
+                f"Wrong loss type '{self.loss_type}', " f"should be 'margin' or 'softmax'"
+            )
+
+    def call(
+        self,
+        inputs_embed: "tf.Tensor",
+        labels_embed: "tf.Tensor",
+        labels: "tf.Tensor",
+        all_labels_embed: "tf.Tensor",
+        all_labels: "tf.Tensor",
+        mask: Optional["tf.Tensor"] = None,
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        """Calculate loss and accuracy."""
+
+        (
+            pos_inputs_embed,
+            pos_labels_embed,
+            neg_inputs_embed,
+            neg_labels_embed,
+            inputs_bad_negs,
+            labels_bad_negs,
+        ) = self._sample_negatives(
+            inputs_embed, labels_embed, labels, all_labels_embed, all_labels)
+
+        # calculate similarities
+        sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li = self._train_sim(
+            pos_inputs_embed,
+            pos_labels_embed,
+            neg_inputs_embed,
+            neg_labels_embed,
+            inputs_bad_negs,
+            labels_bad_negs,
+            mask,
+        )
+
+        acc = self._calc_accuracy(sim_pos, sim_neg_il)
+
+        loss = self._chosen_loss(
+            sim_pos,
+            sim_neg_il,
+            sim_neg_ll,
+            sim_neg_ii,
+            sim_neg_li,
+            mask,
+        )
+
+        return loss, acc
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index f4fd923fdbea..3d7bd88febf3 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -58,26 +58,25 @@ def fit(self,
         if eager:
             # allows increasing batch size
             train_dataset_func = self.train_dataset
-            eval_dataset_func = self.eval_dataset
-
             train_on_batch_func = self.train_on_batch
         else:
             # allows increasing batch size
             train_dataset_func = tf.function(self.train_dataset)
-            eval_dataset_func = tf.function(self.eval_dataset)
-
             train_on_batch_func = tf.function(
                 self.train_on_batch, input_signature=[train_dataset_func(tf_batch_size).element_spec]
             )
 
         if evaluate_on_num_examples > 0:
             if eager:
+                eval_dataset_func = self.eval_dataset
                 eval_func = self.eval
             else:
+                eval_dataset_func = tf.function(self.eval_dataset)
                 eval_func = tf.function(
                     self.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
                 )
         else:
+            eval_dataset_func = None
             eval_func = None
 
         for ep in pbar:
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index fead1198a57a..f5093105f857 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -529,360 +529,6 @@ def append_type(v: np.ndarray):
     return tuple(shapes), tuple(types)
 
 
-def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor":
-    """Make tensor 2D."""
-
-    return tf.reshape(x, (-1, x.shape[-1]))
-
-
-def _tf_sample_neg(
-    batch_size: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor"
-) -> "tf.Tensor":
-    """Sample negative examples for given indices"""
-
-    tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1))
-
-    return tf.gather(tiled_all_bs, neg_ids, batch_dims=-1)
-
-
-def _tf_get_bad_mask(
-    pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor"
-) -> "tf.Tensor":
-    """Calculate bad mask for given indices.
-
-    Checks that input features are different for positive negative samples.
-    """
-
-    pos_b_in_flat = tf.expand_dims(pos_b, -2)
-    neg_b_in_flat = _tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids)
-
-    return tf.cast(
-        tf.reduce_all(tf.equal(neg_b_in_flat, pos_b_in_flat), axis=-1),
-        pos_b_in_flat.dtype,
-    )
-
-
-def _tf_get_negs(
-    all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor", num_neg: int
-) -> Tuple["tf.Tensor", "tf.Tensor"]:
-    """Get negative examples from given tensor."""
-
-    if len(raw_pos.shape) == 3:
-        batch_size = tf.shape(raw_pos)[0]
-        seq_length = tf.shape(raw_pos)[1]
-    else:  # len(raw_pos.shape) == 2
-        batch_size = tf.shape(raw_pos)[0]
-        seq_length = 1
-
-    raw_flat = _tf_make_flat(raw_pos)
-
-    total_candidates = tf.shape(all_embed)[0]
-
-    all_indices = tf.tile(
-        tf.expand_dims(tf.range(0, total_candidates, 1), 0),
-        (batch_size * seq_length, 1),
-    )
-    shuffled_indices = tf.transpose(
-        tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)
-    )
-    neg_ids = shuffled_indices[:, :num_neg]
-
-    bad_negs = _tf_get_bad_mask(raw_flat, all_raw, neg_ids)
-    if len(raw_pos.shape) == 3:
-        bad_negs = tf.reshape(bad_negs, (batch_size, seq_length, -1))
-
-    neg_embed = _tf_sample_neg(batch_size * seq_length, all_embed, neg_ids)
-    if len(raw_pos.shape) == 3:
-        neg_embed = tf.reshape(
-            neg_embed, (batch_size, seq_length, -1, all_embed.shape[-1])
-        )
-
-    return neg_embed, bad_negs
-
-
-def _sample_negatives(
-    a_embed: "tf.Tensor",
-    b_embed: "tf.Tensor",
-    b_raw: "tf.Tensor",
-    all_b_embed: "tf.Tensor",
-    all_b_raw: "tf.Tensor",
-    num_neg: int,
-) -> Tuple[
-    "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"
-]:
-    """Sample negative examples."""
-
-    neg_dial_embed, dial_bad_negs = _tf_get_negs(
-        _tf_make_flat(a_embed), _tf_make_flat(b_raw), b_raw, num_neg
-    )
-
-    neg_bot_embed, bot_bad_negs = _tf_get_negs(
-        _tf_make_flat(all_b_embed), _tf_make_flat(all_b_raw), b_raw, num_neg
-    )
-    return (
-        tf.expand_dims(a_embed, -2),
-        tf.expand_dims(b_embed, -2),
-        neg_dial_embed,
-        neg_bot_embed,
-        dial_bad_negs,
-        bot_bad_negs,
-    )
-
-
-def tf_raw_sim(
-    a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]
-) -> "tf.Tensor":
-    """Calculate similarity between given tensors."""
-
-    sim = tf.reduce_sum(a * b, -1)
-    if mask is not None:
-        sim *= tf.expand_dims(mask, 2)
-
-    return sim
-
-
-def _tf_sim(
-    pos_dial_embed: "tf.Tensor",
-    pos_bot_embed: "tf.Tensor",
-    neg_dial_embed: "tf.Tensor",
-    neg_bot_embed: "tf.Tensor",
-    dial_bad_negs: "tf.Tensor",
-    bot_bad_negs: "tf.Tensor",
-    mask: Optional["tf.Tensor"],
-) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]:
-    """Define similarity."""
-
-    # calculate similarity with several
-    # embedded actions for the loss
-    neg_inf = -1e9  # large_compatible_negative(pos_dial_embed.dtype)
-
-    sim_pos = tf_raw_sim(pos_dial_embed, pos_bot_embed, mask)
-    sim_neg = tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs
-    sim_neg_bot_bot = (
-        tf_raw_sim(pos_bot_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs
-    )
-    sim_neg_dial_dial = (
-        tf_raw_sim(pos_dial_embed, neg_dial_embed, mask) + neg_inf * dial_bad_negs
-    )
-    sim_neg_bot_dial = (
-        tf_raw_sim(pos_bot_embed, neg_dial_embed, mask) + neg_inf * dial_bad_negs
-    )
-
-    # output similarities between user input and bot actions
-    # and similarities between bot actions and similarities between user inputs
-    return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial
-
-
-def _tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
-    """Calculate accuracy"""
-
-    max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
-    return tf.reduce_mean(
-        tf.cast(tf.math.equal(max_all_sim, tf.squeeze(sim_pos, -1)), tf.float32)
-    )
-
-
-# noinspection PyPep8Naming
-def _tf_loss_margin(
-    sim_pos: "tf.Tensor",
-    sim_neg: "tf.Tensor",
-    sim_neg_bot_bot: "tf.Tensor",
-    sim_neg_dial_dial: "tf.Tensor",
-    sim_neg_bot_dial: "tf.Tensor",
-    mask: Optional["tf.Tensor"],
-    mu_pos: float,
-    mu_neg: float,
-    use_max_sim_neg: bool,
-    C_emb: float,
-) -> "tf.Tensor":
-    """Define max margin loss."""
-
-    # loss for maximizing similarity with correct action
-    loss = tf.maximum(0.0, mu_pos - tf.squeeze(sim_pos, -1))
-
-    # loss for minimizing similarity with `num_neg` incorrect actions
-    if use_max_sim_neg:
-        # minimize only maximum similarity over incorrect actions
-        max_sim_neg = tf.reduce_max(sim_neg, -1)
-        loss += tf.maximum(0.0, mu_neg + max_sim_neg)
-    else:
-        # minimize all similarities with incorrect actions
-        max_margin = tf.maximum(0.0, mu_neg + sim_neg)
-        loss += tf.reduce_sum(max_margin, -1)
-
-    # penalize max similarity between pos bot and neg bot embeddings
-    max_sim_neg_bot = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_bot, -1))
-    loss += max_sim_neg_bot * C_emb
-
-    # penalize max similarity between pos dial and neg dial embeddings
-    max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_dial_dial, -1))
-    loss += max_sim_neg_dial * C_emb
-
-    # penalize max similarity between pos bot and neg dial embeddings
-    max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_dial, -1))
-    loss += max_sim_neg_dial * C_emb
-
-    if mask is not None:
-        # mask loss for different length sequences
-        loss *= mask
-        # average the loss over sequence length
-        loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1)
-
-    # average the loss over the batch
-    loss = tf.reduce_mean(loss)
-
-    return loss
-
-
-def _tf_loss_softmax(
-    sim_pos: "tf.Tensor",
-    sim_neg: "tf.Tensor",
-    sim_neg_bot_bot: "tf.Tensor",
-    sim_neg_dial_dial: "tf.Tensor",
-    sim_neg_bot_dial: "tf.Tensor",
-    mask: Optional["tf.Tensor"],
-    scale_loss: bool,
-) -> "tf.Tensor":
-    """Define softmax loss."""
-
-    logits = tf.concat(
-        [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1
-    )
-
-    # create label_ids for softmax
-    label_ids = tf.zeros_like(logits[..., 0], tf.int32)
-
-    if mask is None:
-        mask = 1.0
-
-    if scale_loss:
-        # mask loss by prediction confidence
-        pos_pred = tf.stop_gradient(tf.nn.softmax(logits)[..., 0])
-        scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
-    else:
-        scale_mask = mask
-
-    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-        labels=label_ids, logits=logits
-    )
-
-    # scale loss
-    if len(loss.shape) == 2:
-        # average over the sequence
-        loss = tf.reduce_sum(loss * scale_mask, -1) / tf.reduce_sum(mask, -1)
-    else:
-        loss *= scale_mask
-
-    # average the loss over all examples
-    loss = tf.reduce_mean(loss)
-
-    return loss
-
-
-# noinspection PyPep8Naming
-def _choose_loss(
-    sim_pos: "tf.Tensor",
-    sim_neg: "tf.Tensor",
-    sim_neg_bot_bot: "tf.Tensor",
-    sim_neg_dial_dial: "tf.Tensor",
-    sim_neg_bot_dial: "tf.Tensor",
-    mask: Optional["tf.Tensor"],
-    loss_type: Text,
-    mu_pos: float,
-    mu_neg: float,
-    use_max_sim_neg: bool,
-    C_emb: float,
-    scale_loss: bool,
-) -> "tf.Tensor":
-    """Use loss depending on given option."""
-
-    if loss_type == "margin":
-        return _tf_loss_margin(
-            sim_pos,
-            sim_neg,
-            sim_neg_bot_bot,
-            sim_neg_dial_dial,
-            sim_neg_bot_dial,
-            mask,
-            mu_pos,
-            mu_neg,
-            use_max_sim_neg,
-            C_emb,
-        )
-    elif loss_type == "softmax":
-        return _tf_loss_softmax(
-            sim_pos,
-            sim_neg,
-            sim_neg_bot_bot,
-            sim_neg_dial_dial,
-            sim_neg_bot_dial,
-            mask,
-            scale_loss,
-        )
-    else:
-        raise ValueError(
-            f"Wrong loss type '{loss_type}', " f"should be 'margin' or 'softmax'"
-        )
-
-
-# noinspection PyPep8Naming
-def calculate_loss_acc(
-    a_embed: "tf.Tensor",
-    b_embed: "tf.Tensor",
-    b_raw: "tf.Tensor",
-    all_b_embed: "tf.Tensor",
-    all_b_raw: "tf.Tensor",
-    num_neg: int,
-    mask: Optional["tf.Tensor"],
-    loss_type: Text,
-    mu_pos: float,
-    mu_neg: float,
-    use_max_sim_neg: bool,
-    C_emb: float,
-    scale_loss: bool,
-) -> Tuple["tf.Tensor", "tf.Tensor"]:
-    """Calculate loss and accuracy."""
-
-    (
-        pos_dial_embed,
-        pos_bot_embed,
-        neg_dial_embed,
-        neg_bot_embed,
-        dial_bad_negs,
-        bot_bad_negs,
-    ) = _sample_negatives(a_embed, b_embed, b_raw, all_b_embed, all_b_raw, num_neg)
-
-    # calculate similarities
-    (sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial) = _tf_sim(
-        pos_dial_embed,
-        pos_bot_embed,
-        neg_dial_embed,
-        neg_bot_embed,
-        dial_bad_negs,
-        bot_bad_negs,
-        mask,
-    )
-
-    acc = _tf_calc_accuracy(sim_pos, sim_neg)
-
-    loss = _choose_loss(
-        sim_pos,
-        sim_neg,
-        sim_neg_bot_bot,
-        sim_neg_dial_dial,
-        sim_neg_bot_dial,
-        mask,
-        loss_type,
-        mu_pos,
-        mu_neg,
-        use_max_sim_neg,
-        C_emb,
-        scale_loss,
-    )
-
-    return loss, acc
-
-
 def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
     if similarity_type == "cosine":
         # clip negative values to zero

From 03088efe772cc5d9ec2103d936375207f1d19563 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 11:19:31 +0100
Subject: [PATCH 103/633] replace parameters with constants

---
 .../embedding_intent_classifier.py            | 287 ++++++++++--------
 1 file changed, 158 insertions(+), 129 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 5184efd41aa0..8f25c338e6d0 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -42,6 +42,40 @@
 
 shapes, types = None, None
 
+# constants
+HIDDEN_LAYERS_SIZES_TEXT = "hidden_layers_sizes_text"
+HIDDEN_LAYERS_SIZES_LABEL = "hidden_layers_sizes_label"
+SHARE_HIDDEN_LAYERS = "share_hidden_layers"
+TRANSFORMER_SIZE = "transformer_size"
+NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
+NUM_HEADS = "number_of_attention_heads"
+POS_ENCODING = "positional_encoding"
+MAX_SEQ_LENGTH = "maximum_sequence_length"
+BATCH_SIZES = "batch_sizes"
+BATCH_STRATEGY = "batch_strategy"
+EPOCHS = "epochs"
+RANDOM_SEED = "random_seed"
+LEARNING_RATE = "learning_rate"
+DENSE_DIM = "dense_dimensions"
+EMBED_DIM = "embedding_dimension"
+NUM_NEG = "number_of_negative_examples"
+SIMILARITY_TYPE = "similarity_type"
+LOSS_TYPE = "loss_type"
+MU_POS = "maximum_positive_similarity"
+MU_NEG = "maximum_negative_similarity"
+USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
+SCALE_LOSS = "scale_loss"
+C2 = "l2_regularization"
+C_EMB = "c_emb"
+DROPRATE = "droprate"
+UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
+EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
+EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
+INTENT_CLASSIFICATION = "perform_intent_classification"
+ENTITY_RECOGNITION = "perform_entity_recognition"
+MASKED_LM = "use_masked_language_model"
+SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
+
 
 class EmbeddingIntentClassifier(EntityExtractor):
     """label classifier using supervised embeddings.
@@ -75,101 +109,99 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_text": [],
+        HIDDEN_LAYERS_SIZES_TEXT: [],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_label": [],
+        HIDDEN_LAYERS_SIZES_LABEL: [],
         # Whether to share the hidden layer weights between input words and labels
-        "share_hidden_layers": False,
+        SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
-        "transformer_size": 256,
+        TRANSFORMER_SIZE: 256,
         # number of transformer layers
-        "num_transformer_layers": 2,
+        NUM_TRANSFORMER_LAYERS: 2,
         # number of attention heads in transformer
-        "num_heads": 4,
+        NUM_HEADS: 4,
         # type of positional encoding in transformer
-        "pos_encoding": "timing",  # string 'timing' or 'emb'
+        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
-        "max_seq_length": 256,
+        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
-        "batch_size": [64, 256],
+        BATCH_SIZES: [64, 256],
         # how to create batches
-        "batch_strategy": "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
-        "epochs": 300,
+        EPOCHS: 300,
         # set random seed to any int to get reproducible results
-        "random_seed": None,
+        RANDOM_SEED: None,
         # optimizer
-        "learning_rate": 0.001,
+        LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        "dense_dim": {"text": 512, "label": 20},
+        DENSE_DIM: {"text": 512, "label": 20},
         # dimension size of embedding vectors
-        "embed_dim": 20,
+        EMBED_DIM: 20,
         # the type of the similarity
-        "num_neg": 20,
+        NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
-        "similarity_type": "auto",  # string 'auto' or 'cosine' or 'inner'
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
         # the type of the loss function
-        "loss_type": "softmax",  # string 'softmax' or 'margin'
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        "mu_pos": 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        "mu_neg": -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for incorrect labels
-        "use_max_sim_neg": True,
+        USE_MAX_SIM_NEG: True,
         # scale loss inverse proportionally to confidence of correct prediction
-        "scale_loss": True,
+        SCALE_LOSS: True,
         # regularization parameters
         # the scale of L2 regularization
-        "C2": 0.002,
+        C2: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different labels
-        "C_emb": 0.8,
+        C_EMB: 0.8,
         # dropout rate for rnn
-        "droprate": 0.2,
+        DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        "unidirectional_encoder": True,
+        UNIDIRECTIONAL_ENCODER: True,
         # visualization of accuracy
         # how often to calculate training accuracy
-        "evaluate_every_num_epochs": 20,  # small values may hurt performance
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        "evaluate_on_num_examples": 0,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
-        "intent_classification": True,
+        INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": True,
-        "masked_lm_loss": False,
-        "sparse_input_dropout": False,
+        ENTITY_RECOGNITION: True,
+        MASKED_LM: False,
+        SPARSE_INPUT_DROPOUT: False,
     }
     # end default properties (DOC MARKER - don't remove)
 
     # init helpers
     def _check_config_parameters(self) -> None:
         if (
-            self.component_config["share_hidden_layers"]
-            and self.component_config["hidden_layers_sizes_text"]
-            != self.component_config["hidden_layers_sizes_label"]
+            self.component_config[SHARE_HIDDEN_LAYERS]
+            and self.component_config[HIDDEN_LAYERS_SIZES_TEXT]
+            != self.component_config[HIDDEN_LAYERS_SIZES_LABEL]
         ):
             raise ValueError(
                 "If hidden layer weights are shared,"
                 "hidden_layer_sizes for text and label must coincide."
             )
 
-        if self.component_config["similarity_type"] == "auto":
-            if self.component_config["loss_type"] == "softmax":
-                self.component_config["similarity_type"] = "inner"
-            elif self.component_config["loss_type"] == "margin":
-                self.component_config["similarity_type"] = "cosine"
+        if self.component_config[SIMILARITY_TYPE] == "auto":
+            if self.component_config[LOSS_TYPE] == "softmax":
+                self.component_config[SIMILARITY_TYPE] = "inner"
+            elif self.component_config[LOSS_TYPE] == "margin":
+                self.component_config[SIMILARITY_TYPE] = "cosine"
 
-        if self.component_config["evaluate_every_num_epochs"] < 1:
-            self.component_config["evaluate_every_num_epochs"] = self.component_config[
-                "epochs"
-            ]
+        if self.component_config[EVAL_NUM_EPOCHS] < 1:
+            self.component_config[EVAL_NUM_EPOCHS] = self.component_config[EPOCHS]
 
     # package safety checks
     @classmethod
@@ -346,7 +378,7 @@ def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int:
         return num_features
 
     def check_input_dimension_consistency(self, session_data: "SessionDataType"):
-        if self.component_config["share_hidden_layers"]:
+        if self.component_config[SHARE_HIDDEN_LAYERS]:
             num_text_features = self._get_num_of_features(session_data, "text_features")
             num_intent_features = self._get_num_of_features(
                 session_data, "label_features"
@@ -494,7 +526,7 @@ def _create_session_data(
                 if label_id_dict:
                     label_ids.append(label_id_dict[e.get(label_attribute)])
 
-            if self.component_config["named_entity_recognition"] and tag_id_dict:
+            if self.component_config[ENTITY_RECOGNITION] and tag_id_dict:
                 _tags = []
                 for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
                     _tag = determine_token_labels(t, e.get(ENTITIES_ATTRIBUTE), None)
@@ -576,11 +608,11 @@ def train(
         logger.debug("Started training embedding classifier.")
 
         # set numpy random seed
-        np.random.seed(self.component_config["random_seed"])
+        np.random.seed(self.component_config[RANDOM_SEED])
 
         session_data = self.preprocess_train_data(training_data)
 
-        if self.component_config["intent_classification"]:
+        if self.component_config[INTENT_CLASSIFICATION]:
             possible_to_train = self._check_enough_labels(session_data)
 
             if not possible_to_train:
@@ -591,11 +623,11 @@ def train(
                 )
                 return
 
-        if self.component_config["evaluate_on_num_examples"]:
+        if self.component_config[EVAL_NUM_EXAMPLES]:
             session_data, eval_session_data = train_utils.train_val_split(
                 session_data,
-                self.component_config["evaluate_on_num_examples"],
-                self.component_config["random_seed"],
+                self.component_config[EVAL_NUM_EXAMPLES],
+                self.component_config[RANDOM_SEED],
                 label_key="label_ids",
             )
         else:
@@ -603,7 +635,7 @@ def train(
 
         # TODO set it in the model
         # set random seed
-        tf.random.set_seed(self.component_config["random_seed"])
+        tf.random.set_seed(self.component_config[RANDOM_SEED])
 
         self.model = DIET(
             session_data,
@@ -614,10 +646,10 @@ def train(
         )
 
         self.model.fit(
-            self.component_config["epochs"],
-            self.component_config["batch_size"],
-            self.component_config["evaluate_on_num_examples"],
-            self.component_config["evaluate_every_num_epochs"],
+            self.component_config[EPOCHS],
+            self.component_config[BATCH_SIZES],
+            self.component_config[EVAL_NUM_EXAMPLES],
+            self.component_config[EVAL_NUM_EPOCHS],
         )
 
         # rebuild the graph for prediction
@@ -743,13 +775,13 @@ def process(self, message: "Message", **kwargs: Any) -> None:
 
         out = self._predict(message)
 
-        if self.component_config["intent_classification"]:
+        if self.component_config[INTENT_CLASSIFICATION]:
             label, label_ranking = self._predict_label(out)
 
             message.set("label", label, add_to_output=True)
             message.set("label_ranking", label_ranking, add_to_output=True)
 
-        if self.component_config["named_entity_recognition"]:
+        if self.component_config[ENTITY_RECOGNITION]:
             entities = self._predict_entities(out, message)
 
             message.set("entities", entities, add_to_output=True)
@@ -851,11 +883,11 @@ def load(
         ) as f:
             batch_tuple_sizes = pickle.load(f)
 
-        if meta["similarity_type"] == "auto":
-            if meta["loss_type"] == "softmax":
-                meta["similarity_type"] = "inner"
-            elif meta["loss_type"] == "margin":
-                meta["similarity_type"] = "cosine"
+        if meta[SIMILARITY_TYPE] == "auto":
+            if meta[LOSS_TYPE] == "softmax":
+                meta[SIMILARITY_TYPE] = "inner"
+            elif meta[LOSS_TYPE] == "margin":
+                meta[SIMILARITY_TYPE] = "cosine"
 
         model = DIET(dummy_session_data, None, label_data, inv_tag_dict, meta)
 
@@ -943,7 +975,7 @@ def __init__(
         self.training = tf.ones((), tf.bool)
 
         # tf training
-        self._optimizer = tf.keras.optimizers.Adam(config["learning_rate"])
+        self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
         self.entity_f1 = tfa.metrics.F1Score(
             num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
             average="micro",
@@ -954,48 +986,48 @@ def __init__(
         self.batch_tuple_sizes = None
 
     def _prepare_layers(self, session_data: SessionDataType):
-        self._sparse_dropout = tf_layers.SparseDropout(rate=self.config["droprate"])
+        self._sparse_dropout = tf_layers.SparseDropout(rate=self.config[DROPRATE])
 
         self._sparse_to_dense = {
             "text": self._create_sparse_dense_layer(
                 session_data["text_features"],
                 "text",
-                self.config["C2"],
-                self.config["dense_dim"]["text"],
+                self.config[C2],
+                self.config[DENSE_DIM]["text"],
             ),
             "label": self._create_sparse_dense_layer(
                 session_data["label_features"],
                 "label",
-                self.config["C2"],
-                self.config["dense_dim"]["label"],
+                self.config[C2],
+                self.config[DENSE_DIM]["label"],
             ),
         }
 
         self._ffnn = {
             "text": tf_layers.ReluFfn(
-                self.config["hidden_layers_sizes_text"],
-                self.config["droprate"],
-                self.config["C2"],
-                "text_intent" if self.config["share_hidden_layers"] else "text",
+                self.config[HIDDEN_LAYERS_SIZES_TEXT],
+                self.config[DROPRATE],
+                self.config[C2],
+                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
             ),
             "label": tf_layers.ReluFfn(
-                self.config["hidden_layers_sizes_label"],
-                self.config["droprate"],
-                self.config["C2"],
-                "text_intent" if self.config["share_hidden_layers"] else "label",
+                self.config[HIDDEN_LAYERS_SIZES_LABEL],
+                self.config[DROPRATE],
+                self.config[C2],
+                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
             ),
         }
 
-        if self.config["num_transformer_layers"] > 0:
+        if self.config[NUM_TRANSFORMER_LAYERS] > 0:
             self._transformer = tf_layers.TransformerEncoder(
-                self.config["num_transformer_layers"],
-                self.config["transformer_size"],
-                self.config["num_heads"],
-                self.config["transformer_size"] * 4,
-                self.config["max_seq_length"],
-                self.config["C2"],
-                self.config["droprate"],
-                self.config["unidirectional_encoder"],
+                self.config[NUM_TRANSFORMER_LAYERS],
+                self.config[TRANSFORMER_SIZE],
+                self.config[NUM_HEADS],
+                self.config[TRANSFORMER_SIZE] * 4,
+                self.config[MAX_SEQ_LENGTH],
+                self.config[C2],
+                self.config[DROPRATE],
+                self.config[UNIDIRECTIONAL_ENCODER],
                 name="text_encoder",
             )
         else:
@@ -1007,37 +1039,37 @@ def _prepare_layers(self, session_data: SessionDataType):
         self.eval_metrics = {"val_t_loss": tf.keras.metrics.Mean(name="val_t_loss")}
 
         self._input_mask = None
-        if self.config["masked_lm_loss"]:
+        if self.config[MASKED_LM]:
             self._input_mask = tf_layers.InputMask()
             self._embed["text_mask"] = tf_layers.Embed(
-                self.config["embed_dim"],
-                self.config["C2"],
+                self.config[EMBED_DIM],
+                self.config[C2],
                 "text_mask",
-                self.config["similarity_type"],
+                self.config[SIMILARITY_TYPE],
             )
             self._embed["text_token"] = tf_layers.Embed(
-                self.config["embed_dim"],
-                self.config["C2"],
+                self.config[EMBED_DIM],
+                self.config[C2],
                 "text_token",
-                self.config["similarity_type"],
+                self.config[SIMILARITY_TYPE],
             )
             self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
             self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
             self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
             self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
 
-        if self.config["intent_classification"]:
+        if self.config[INTENT_CLASSIFICATION]:
             self._embed["text"] = tf_layers.Embed(
-                self.config["embed_dim"],
-                self.config["C2"],
+                self.config[EMBED_DIM],
+                self.config[C2],
                 "text",
-                self.config["similarity_type"],
+                self.config[SIMILARITY_TYPE],
             )
             self._embed["label"] = tf_layers.Embed(
-                self.config["embed_dim"],
-                self.config["C2"],
+                self.config[EMBED_DIM],
+                self.config[C2],
                 "label",
-                self.config["similarity_type"],
+                self.config[SIMILARITY_TYPE],
             )
             self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
             self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
@@ -1045,11 +1077,11 @@ def _prepare_layers(self, session_data: SessionDataType):
             self.eval_metrics["val_i_acc"] = tf.keras.metrics.Mean(name="val_i_acc")
 
         self._crf = None
-        if self.config["named_entity_recognition"]:
+        if self.config[ENTITY_RECOGNITION]:
             self._embed["logits"] = tf_layers.Embed(
-                self._num_tags, self.config["C2"], "logits"
+                self._num_tags, self.config[C2], "logits"
             )
-            self._crf = tf_layers.CRF(self._num_tags, self.config["C2"])
+            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
             self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
             self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
             self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
@@ -1102,7 +1134,7 @@ def _create_sequence(
         masked_lm_loss: bool = False,
     ):
         x = self._combine_sparse_dense_features(
-            features, mask, name, sparse_dropout=self.config["sparse_input_dropout"]
+            features, mask, name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT]
         )
 
         if masked_lm_loss:
@@ -1138,14 +1170,14 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
             a_masked,
             a_embed,
             a,
-            self.config["num_neg"],
+            self.config[NUM_NEG],
             None,
-            self.config["loss_type"],
-            self.config["mu_pos"],
-            self.config["mu_neg"],
-            self.config["use_max_sim_neg"],
-            self.config["C_emb"],
-            self.config["scale_loss"],
+            self.config[LOSS_TYPE],
+            self.config[MU_POS],
+            self.config[MU_NEG],
+            self.config[USE_MAX_SIM_NEG],
+            self.config[C_EMB],
+            self.config[SCALE_LOSS],
         )
 
     def _build_all_b(self):
@@ -1170,14 +1202,14 @@ def _intent_loss(self, a, b):
             b,
             all_labels_embed,
             all_labels,
-            self.config["num_neg"],
+            self.config[NUM_NEG],
             None,
-            self.config["loss_type"],
-            self.config["mu_pos"],
-            self.config["mu_neg"],
-            self.config["use_max_sim_neg"],
-            self.config["C_emb"],
-            self.config["scale_loss"],
+            self.config[LOSS_TYPE],
+            self.config[MU_POS],
+            self.config[MU_NEG],
+            self.config[USE_MAX_SIM_NEG],
+            self.config[C_EMB],
+            self.config[SCALE_LOSS],
         )
 
     def _entity_loss(
@@ -1221,23 +1253,20 @@ def _train_losses_scores(self, batch_in):
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
 
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
-            tf_batch_data["text_features"],
-            mask_text,
-            "text",
-            self.config["masked_lm_loss"],
+            tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
         )
 
         losses = {}
         scores = {}
 
-        if self.config["masked_lm_loss"]:
+        if self.config[MASKED_LM]:
             loss, acc = self._mask_loss(
                 text_transformed, text_in, lm_mask_bool_text, "text"
             )
             losses["m_loss"] = loss
             scores["m_acc"] = acc
 
-        if self.config["intent_classification"]:
+        if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
             last_index = tf.maximum(
                 tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
@@ -1252,7 +1281,7 @@ def _train_losses_scores(self, batch_in):
             losses["i_loss"] = loss
             scores["i_acc"] = acc
 
-        if self.config["named_entity_recognition"]:
+        if self.config[ENTITY_RECOGNITION]:
             tags = tf_batch_data["tag_ids"][0]
 
             loss, f1 = self._entity_loss(
@@ -1284,7 +1313,7 @@ def train_dataset(self, batch_size):
             self.session_data,
             batch_size,
             label_key="label_ids",
-            batch_strategy=self.config["batch_strategy"],
+            batch_strategy=self.config[BATCH_STRATEGY],
             shuffle=True,
         )
 
@@ -1321,7 +1350,7 @@ def predict(self, batch_in):
         )
 
         out = {}
-        if self.config["intent_classification"]:
+        if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
             last_index = tf.maximum(
                 tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
@@ -1346,11 +1375,11 @@ def predict(self, batch_in):
             # )
 
             scores = train_utils.confidence_from_sim(
-                sim_all, self.config["similarity_type"]
+                sim_all, self.config[SIMILARITY_TYPE]
             )
             out["i_scores"] = scores
 
-        if self.config["named_entity_recognition"]:
+        if self.config[ENTITY_RECOGNITION]:
             sequence_lengths = sequence_lengths - 1
             logits = self._embed["logits"](text_transformed)
             pred_ids = self._crf(logits, sequence_lengths)

From 95700d9efce9a4fa71f1cd2c764a60b078248d43 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 11:29:16 +0100
Subject: [PATCH 104/633] Use constants in response selector

---
 .../selectors/embedding_response_selector.py  | 104 ++++++++++++------
 1 file changed, 69 insertions(+), 35 deletions(-)

diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index b08dcdde7010..7bf71d1b130f 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -3,7 +3,41 @@
 from typing import Any, Dict, Text
 
 from rasa.nlu.components import any_of
-from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.classifiers.embedding_intent_classifier import (
+    EmbeddingIntentClassifier,
+    USE_MAX_SIM_NEG,
+    HIDDEN_LAYERS_SIZES_TEXT,
+    HIDDEN_LAYERS_SIZES_LABEL,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    POS_ENCODING,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    EMBED_DIM,
+    NUM_NEG,
+    SIMILARITY_TYPE,
+    LOSS_TYPE,
+    MU_POS,
+    MU_NEG,
+    SCALE_LOSS,
+    C2,
+    C_EMB,
+    DROPRATE,
+    UNIDIRECTIONAL_ENCODER,
+    EVAL_NUM_EPOCHS,
+    EVAL_NUM_EXAMPLES,
+    INTENT_CLASSIFICATION,
+    ENTITY_RECOGNITION,
+    MASKED_LM,
+    SPARSE_INPUT_DROPOUT,
+)
 from rasa.nlu.constants import (
     RESPONSE_ATTRIBUTE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
@@ -55,81 +89,81 @@ class ResponseSelector(EmbeddingIntentClassifier):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_a": [256, 128],
+        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_b": [256, 128],
-        # sizes of hidden layers before the embedding layer for tag labels
-        # the number of hidden layers is thus equal to the length of this list
-        "hidden_layers_sizes_c": [],
+        HIDDEN_LAYERS_SIZES_LABEL: [256, 128],
         # Whether to share the hidden layer weights between input words and intent labels
-        "share_hidden_layers": False,
+        SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
-        "transformer_size": 128,
+        TRANSFORMER_SIZE: 128,
         # number of transformer layers
-        "num_transformer_layers": 1,
+        NUM_TRANSFORMER_LAYERS: 1,
         # number of attention heads in transformer
-        "num_heads": 4,
+        NUM_HEADS: 4,
         # type of positional encoding in transformer
-        "pos_encoding": "timing",  # string 'timing' or 'emb'
+        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
-        "max_seq_length": 256,
+        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
-        "batch_size": [64, 256],
+        BATCH_SIZES: [64, 256],
         # how to create batches
-        "batch_strategy": "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
-        "epochs": 300,
+        EPOCHS: 300,
         # set random seed to any int to get reproducible results
-        "random_seed": None,
+        RANDOM_SEED: None,
+        # optimizer
+        LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        "dense_dim": {"text": 512, "label": 20},
+        DENSE_DIM: {"text": 512, "label": 20},
         # dimension size of embedding vectors
-        "embed_dim": 20,
+        EMBED_DIM: 20,
         # the type of the similarity
-        "num_neg": 20,
+        NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
-        "similarity_type": "auto",  # string 'auto' or 'cosine' or 'inner'
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
         # the type of the loss function
-        "loss_type": "softmax",  # string 'softmax' or 'margin'
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
         # how similar the algorithm should try
         # to make embedding vectors for correct intent labels
-        "mu_pos": 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect intent labels
-        "mu_neg": -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for
         # incorrect intent labels
-        "use_max_sim_neg": True,
+        USE_MAX_SIM_NEG: True,
         # scale loss inverse proportionally to confidence of correct prediction
-        "scale_loss": True,
+        SCALE_LOSS: True,
         # regularization parameters
         # the scale of L2 regularization
-        "C2": 0.002,
+        C2: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different intent labels
-        "C_emb": 0.8,
+        C_EMB: 0.8,
         # dropout rate for rnn
-        "droprate": 0.2,
+        DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        "unidirectional_encoder": True,
+        UNIDIRECTIONAL_ENCODER: True,
         # visualization of accuracy
         # how often to calculate training accuracy
-        "evaluate_every_num_epochs": 20,  # small values may hurt performance
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        "evaluate_on_num_examples": 0,  # large values may hurt performance,
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
         # if true intent classification is trained and intent predicted
-        "intent_classification": True,
+        INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
-        "named_entity_recognition": False,
-        # number of entity tags
-        "num_tags": 0,
+        ENTITY_RECOGNITION: False,
+        MASKED_LM: False,
+        SPARSE_INPUT_DROPOUT: False,
     }
+
     # end default properties (DOC MARKER - don't remove)
 
     def _load_selector_params(self, config: Dict[Text, Any]):

From 0a4e3bcd10db8d6f76d973ee2d953115988fe311 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 11:59:18 +0100
Subject: [PATCH 105/633] update requirements

---
 requirements.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 271924da2522..20fb949c5110 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ jsonpickle==1.1
 redis==3.3.5
 pymongo[tls,srv]==3.8.0
 numpy==1.16.3
-scipy==1.2.1
+scipy==1.4.1
 #tensorflow==1.15.0
 absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
@@ -64,6 +64,4 @@ gast==0.2.2
 # for new featurizers
 tensorflow==2.1.0
 tensorflow_hub==0.6.0
-#tensorflow_text[no-deps]==0.1.0
-# to calculate f1 score in new architecture
-git+https://github.com/guillaumegenthial/tf_metrics.git
+tensorflow-addons==0.7.0

From 00a8bcac8b7fafae928da9006a8ef65fb3bc2b21 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 12:14:11 +0100
Subject: [PATCH 106/633] make mask loss more efficient

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 10 ++++------
 rasa/utils/tf_layers.py                             | 12 +++---------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fb59ef01b194..7e070c61bf5b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1398,16 +1398,14 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
         a_masked = tf.boolean_mask(a, lm_mask_bool)
 
         a_t_masked_embed = self._embed[f"{name}_mask"](a_t_masked)
-        a_embed = self._embed[f"{name}_token"](a)
-
-        a_embed_masked = tf.boolean_mask(a_embed, lm_mask_bool)
+        a_masked_embed = self._embed[f"{name}_token"](a_masked)
 
         return self._loss_mask(
             a_t_masked_embed,
-            a_embed_masked,
+            a_masked_embed,
+            a_masked,
+            a_masked_embed,
             a_masked,
-            a_embed,
-            a,
         )
 
     def _build_all_b(self):
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index bf601b3edb7e..4da49c184277 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -484,21 +484,14 @@ def _make_flat(x: "tf.Tensor") -> "tf.Tensor":
 
     def _random_indices(self, batch_size: "tf.Tensor", total_candidates: "tf.Tensor"):
 
-        # all_indices = tf.tile(
-        #     tf.expand_dims(tf.range(total_candidates), 0),
-        #     (batch_size, 1),
-        # )
-        # shuffled_indices = tf.transpose(
-        #     tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0)
-        # )
-        # return shuffled_indices[:, :self.num_neg]
-
         def rand_idxs():
             """Create random tensor of indices"""
             # (1, num_neg)
             return tf.expand_dims(
                 tf.random.shuffle(tf.range(total_candidates))[:self.num_neg], 0)
 
+        # return tf.tile(rand_idxs(), (batch_size, 1))
+
         def cond(i, out):
             """Condition for while loop"""
             return i < batch_size
@@ -522,6 +515,7 @@ def body(i, out):
             body,
             loop_vars=[i1, out1],
             shape_invariants=[i1.shape, tf.TensorShape([None, self.num_neg])],
+            parallel_iterations=1000,
             back_prop=False,
         )[1]
 

From dcb7402a606791ead53cd2dde6991a47c42b6858 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 12:29:49 +0100
Subject: [PATCH 107/633] fix provide for embeddingintentclassifier

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7e070c61bf5b..78338552c977 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -69,7 +69,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
     and additional hidden layers are added together with dropout.
     """
 
-    provides = ["label", "label_ranking", "entities"]
+    provides = ["intent", "intent_ranking", "entities"]
 
     requires = [
         any_of(

From 9d8e472d97ee78a1498a83d90f8846f9d62552cc Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 12:39:29 +0100
Subject: [PATCH 108/633] fix intent prediction

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 8 +++-----
 rasa/utils/tf_layers.py                             | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 78338552c977..6494a672f4d3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -922,8 +922,8 @@ def process(self, message: "Message", **kwargs: Any) -> None:
         if self.intent_classification:
             label, label_ranking = self._predict_label(out)
 
-            message.set("label", label, add_to_output=True)
-            message.set("label_ranking", label_ranking, add_to_output=True)
+            message.set("intent", label, add_to_output=True)
+            message.set("intent_ranking", label_ranking, add_to_output=True)
 
         if self.named_entity_recognition:
             entities = self._predict_entities(out, message)
@@ -1588,7 +1588,6 @@ def predict(self, batch_in):
             sim_all = self._loss_label.sim(
                 cls_embed[:, tf.newaxis, :],
                 self.all_labels_embed[tf.newaxis, :, :],
-                None,
             )
             # label = self._create_bow(
             #     tf_batch_data["label_features"],
@@ -1606,9 +1605,8 @@ def predict(self, batch_in):
             out["i_scores"] = scores
 
         if self._named_entity_recognition:
-            sequence_lengths = sequence_lengths - 1
             logits = self._embed["logits"](text_transformed)
-            pred_ids = self._crf(logits, sequence_lengths)
+            pred_ids = self._crf(logits, sequence_lengths - 1)
             out["e_ids"] = pred_ids
 
         return out
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 4da49c184277..8cde56f4c7c2 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -602,7 +602,7 @@ def _sample_negatives(
 
     @staticmethod
     def sim(
-        a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]
+        a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"] = None
     ) -> "tf.Tensor":
         """Calculate similarity between given tensors."""
 

From 55282783c9d3688820acfac5b95587831bbac008 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 12:41:34 +0100
Subject: [PATCH 109/633] update requirements

---
 .../embedding_intent_classifier.py            |  81 +++----------
 rasa/utils/plotter.py                         | 114 ------------------
 requirements.txt                              |  19 +--
 3 files changed, 24 insertions(+), 190 deletions(-)
 delete mode 100644 rasa/utils/plotter.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6494a672f4d3..688e6d51c762 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -13,7 +13,6 @@
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 
 import rasa.utils.io as io_utils
-from rasa.utils.plotter import Plotter
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -165,10 +164,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
     def _check_old_config_variables(config: Dict[Text, Any]) -> None:
         """Config migration warning"""
 
-        removed_tokenization_params = [
-            "label_tokenization_flag",
-            "label_split_symbol",
-        ]
+        removed_tokenization_params = ["label_tokenization_flag", "label_split_symbol"]
         for removed_param in removed_tokenization_params:
             if removed_param in config:
                 warnings.warn(
@@ -501,7 +497,7 @@ def _extract_labels_precomputed_features(
     #     labels_example: List["Message"],
     # ) -> List[np.ndarray]:
     #     """Compute one-hot representation for the labels"""
-    # 
+    #
     #     return [
     #         np.array(
     #             [
@@ -816,9 +812,7 @@ def _predict(self, message: "Message"):
 
         return self.predict_func(batch_in)
 
-    def _predict_label(
-            self, out
-    ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
+    def _predict_label(self, out) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
         """Predicts the intent of the provided message."""
 
         label = {"name": None, "confidence": 0.0}
@@ -874,7 +868,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
             tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
 
         entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags,predictions
+            message.text, message.get("tokens", []), tags, predictions
         )
 
         extracted = self.add_extractor_name(entities)
@@ -884,7 +878,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
 
     @staticmethod
     def _convert_tags_to_entities(
-        text: str, tokens: List[Token], tags: List[Text],predictions
+        text: str, tokens: List[Token], tags: List[Text], predictions
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = "O"
@@ -910,7 +904,7 @@ def _convert_tags_to_entities(
             last_tag = tag
 
         for entity in entities:
-            entity["value"] = text[entity["start"]: entity["end"]]
+            entity["value"] = text[entity["start"] : entity["end"]]
 
         return entities
 
@@ -956,11 +950,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             if e.errno != errno.EEXIST:
                 raise
 
-        self.model.save_weights(tf_model_file, save_format='tf')
+        self.model.save_weights(tf_model_file, save_format="tf")
 
         dummy_session_data = {
-            k: [v[:1] for v in vs]
-            for k, vs in self.model.session_data.items()
+            k: [v[:1] for v in vs] for k, vs in self.model.session_data.items()
         }
 
         with open(
@@ -968,9 +961,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         ) as f:
             pickle.dump(dummy_session_data, f)
 
-        with open(
-            os.path.join(model_dir, file_name + ".label_data.pkl"), "wb"
-        ) as f:
+        with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
             pickle.dump(self._label_data, f)
 
         with open(
@@ -1078,22 +1069,13 @@ def load(
                 meta["batch_strategy"],
             )
             logger.debug("Loading the model ...")
-            model.fit(
-                1,
-                1,
-                0,
-                0,
-                silent=True,
-                eager=True,
-            )
+            model.fit(1, 1, 0, 0, silent=True, eager=True)
             model.load_weights(tf_model_file)
 
             # build the graph for prediction
             model.set_training_phase(False)
             model.session_data = {
-                k: vs
-                for k, vs in model.session_data.items()
-                if "text" in k
+                k: vs for k, vs in model.session_data.items() if "text" in k
             }
             model.build_for_predict()
             predict_dataset = model.predict_dataset()
@@ -1189,9 +1171,7 @@ def __init__(
         self.session_data = session_data
         self.eval_session_data = eval_session_data
         label_batch = train_utils.prepare_batch(label_data)
-        self.tf_label_data = train_utils.batch_to_session_data(
-            label_batch, label_data
-        )
+        self.tf_label_data = train_utils.batch_to_session_data(label_batch, label_data)
 
         # options
         self._sparse_input_dropout = sparse_input_dropout
@@ -1401,11 +1381,7 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
         a_masked_embed = self._embed[f"{name}_token"](a_masked)
 
         return self._loss_mask(
-            a_t_masked_embed,
-            a_masked_embed,
-            a_masked,
-            a_masked_embed,
-            a_masked,
+            a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
 
     def _build_all_b(self):
@@ -1424,13 +1400,7 @@ def _intent_loss(self, a, b):
         a_embed = self._embed["text"](a)
         b_embed = self._embed["label"](b)
 
-        return self._loss_label(
-            a_embed,
-            b_embed,
-            b,
-            all_labels_embed,
-            all_labels,
-        )
+        return self._loss_label(a_embed, b_embed, b, all_labels_embed, all_labels)
 
     def _entity_loss(
         self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor", sequence_lengths
@@ -1467,9 +1437,7 @@ def _entity_loss(
         return loss, f1
 
     def _train_losses_scores(self, batch_in):
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1497,9 +1465,7 @@ def _train_losses_scores(self, batch_in):
             cls = tf.gather_nd(text_transformed, idxs)
 
             label = self._create_bow(
-                tf_batch_data["label_features"],
-                tf_batch_data["label_mask"][0],
-                "label",
+                tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
             )
             loss, acc = self._intent_loss(cls, label)
             losses["i_loss"] = loss
@@ -1564,9 +1530,7 @@ def build_for_predict(self):
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(self, batch_in):
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1586,8 +1550,7 @@ def predict(self, batch_in):
             cls_embed = self._embed["text"](cls)
 
             sim_all = self._loss_label.sim(
-                cls_embed[:, tf.newaxis, :],
-                self.all_labels_embed[tf.newaxis, :, :],
+                cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
             )
             # label = self._create_bow(
             #     tf_batch_data["label_features"],
@@ -1599,9 +1562,7 @@ def predict(self, batch_in):
             #     cls_embed[:, tf.newaxis, :], label_embed, None
             # )
 
-            scores = train_utils.confidence_from_sim(
-                sim_all, self._similarity_type
-            )
+            scores = train_utils.confidence_from_sim(sim_all, self._similarity_type)
             out["i_scores"] = scores
 
         if self._named_entity_recognition:
@@ -1613,7 +1574,5 @@ def predict(self, batch_in):
 
     def predict_dataset(self):
         return train_utils.create_tf_dataset(
-            self.session_data,
-            1,
-            label_key="label_ids",
+            self.session_data, 1, label_key="label_ids"
         )
diff --git a/rasa/utils/plotter.py b/rasa/utils/plotter.py
deleted file mode 100644
index 515ba7ec38c6..000000000000
--- a/rasa/utils/plotter.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from pathlib import Path
-from typing import Union, List, Text
-
-import numpy as np
-import csv
-
-
-# to enable %matplotlib inline if running in ipynb
-from IPython import get_ipython
-
-ipy = get_ipython()
-if ipy is not None:
-    ipy.run_line_magic("matplotlib", "inline")
-
-
-import matplotlib.pyplot as plt
-
-
-class Plotter(object):
-    """
-    Plots training parameters (loss, f-score, and accuracy) and training weights over time.
-    Input files are the output files 'loss.tsv' and 'weights.txt' from training either a sequence tagger or text
-    classification model.
-    """
-
-    @staticmethod
-    def _extract_evaluation_data(
-        file_name: Text, score: str = "loss", prefix: str = "i"
-    ) -> dict:
-        training_curves = {"train": [], "val": []}
-
-        with open(file_name, "r") as tsvin:
-            tsvin = csv.reader(tsvin, delimiter="\t")
-
-            # determine the column index of loss, f-score and accuracy for train, dev and test split
-            row = next(tsvin, None)
-
-            score = score.upper()
-
-            TRAIN_SCORE = (
-                row.index(f"{prefix.upper()}_{score.upper()}")
-                if f"{prefix.upper()}_{score.upper()}" in row
-                else None
-            )
-            VAL_SCORE = (
-                row.index(f"VAL_{prefix.upper()}_{score.upper()}")
-                if f"VAL_{prefix.upper()}_{score.upper()}" in row
-                else None
-            )
-
-            # then get all relevant values from the tsv
-            for row in tsvin:
-
-                if TRAIN_SCORE is not None:
-                    if row[TRAIN_SCORE] != "_":
-                        training_curves["train"].append(float(row[TRAIN_SCORE]))
-
-                if VAL_SCORE is not None:
-                    if VAL_SCORE < len(row) and row[VAL_SCORE] != "_":
-                        training_curves["val"].append(float(row[VAL_SCORE]))
-                    else:
-                        training_curves["val"].append(0.0)
-
-        return training_curves
-
-    def plot_training_curves(self, file_name: Union[Text], output_folder: Text):
-        if type(output_folder) is str:
-            output_folder = Path(output_folder)
-
-        metrics = {
-            "intent": {"scores": ["loss", "acc"], "prefix": "i"},
-            "entity": {"scores": ["loss", "f1"], "prefix": "e"},
-            "mask": {"scores": ["loss", "acc"], "prefix": "m"},
-        }
-
-        for metric_name, metric_values in metrics.items():
-
-            fig = plt.figure(figsize=(15, 10))
-
-            prefix = metric_values["prefix"]
-            scores = metric_values["scores"]
-
-            output_path = output_folder / f"training_{metric_name}.png"
-
-            for i, score in enumerate(scores):
-                training_curves = self._extract_evaluation_data(
-                    file_name, score, prefix
-                )
-
-                plt.subplot(len(scores), 1, i + 1)
-                if training_curves["train"]:
-                    x = np.arange(0, len(training_curves["train"]))
-                    plt.plot(
-                        x,
-                        training_curves["train"],
-                        label=f"train {metric_name} {score}",
-                    )
-                if training_curves["val"]:
-                    x = np.arange(0, len(training_curves["val"]))
-                    plt.plot(
-                        x, training_curves["val"], label=f"val {metric_name} {score}"
-                    )
-
-                plt.legend(bbox_to_anchor=(1.04, 0), loc="lower left", borderaxespad=0)
-                plt.ylabel(f"{metric_name} {score}")
-                plt.xlabel("epochs")
-
-            # save plots
-            plt.tight_layout(pad=1.0)
-            plt.savefig(output_path, dpi=300)
-            print(
-                f"Loss and acc plots are saved in {output_path}"
-            )  # to let user know the path of the save plots
-            plt.close(fig)
diff --git a/requirements.txt b/requirements.txt
index 4c1b9ae24cf8..6aff58027e82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,13 +7,11 @@ jsonpickle==1.1
 redis==3.3.5
 pymongo[tls,srv]==3.8.0
 numpy==1.16.3
-scipy==1.2.1
-#tensorflow==1.15.0
+scipy==1.4.1
 absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
 # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
 setuptools >= 41.0.0
-tensorflow-probability==0.7.0
 tensor2tensor==1.14.0
 apscheduler==3.6.0
 tqdm==4.31.0
@@ -62,16 +60,7 @@ python-dateutil==2.8.0
 # https://github.com/tensorflow/tensorflow/issues/32319
 gast==0.2.2
 # for new featurizers
-tensorflow==1.14.0
+tensorflow==2.1.0
 tensorflow_hub==0.6.0
-#tensorflow_text[no-deps]==0.1.0
-torch
-torchvision
-transformers
-# for hermit evaluation
-pandas
-progress
-# for plotter
-ipython
-# to calculate f1 score in new architecture
-git+https://github.com/guillaumegenthial/tf_metrics.git
+tensorflow-addons==0.7.0
+tensorflow-probability==0.7.0
\ No newline at end of file

From 465c4ebe6cf7f3498a34374fa0a34fb32d453479 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 12:54:38 +0100
Subject: [PATCH 110/633] remove predictions argument

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 688e6d51c762..8d0521c88783 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -868,7 +868,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
             tags = [t[2:] if t[:2] in ["B-", "I-", "U-", "L-"] else t for t in tags]
 
         entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags, predictions
+            message.text, message.get("tokens", []), tags
         )
 
         extracted = self.add_extractor_name(entities)
@@ -878,7 +878,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
 
     @staticmethod
     def _convert_tags_to_entities(
-        text: str, tokens: List[Token], tags: List[Text], predictions
+        text: str, tokens: List[Token], tags: List[Text]
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = "O"

From b94b4f9d454f3c6bf6648d83f8512acae88edd7d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 13:20:14 +0100
Subject: [PATCH 111/633] remove bias option

---
 docs/nlu/components.rst                                   | 4 +---
 rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 255d96522859..b855e2c08c7e 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -329,6 +329,7 @@ TextFeaturizer
     low             Checks if the word is lower case.
     upper           Checks if the word is upper case.
     title           Checks if the word starts with an uppercase character and all remaining characters are lowercased.
+    digit           Checks if the word contains just digits.
     prefix5         Take the first five characters of the word.
     prefix2         Take the first two characters of the word.
     suffix5         Take the last five characters of the word.
@@ -337,8 +338,6 @@ TextFeaturizer
     suffix1         Take the last character of the word.
     pos             Take the Part-of-Speech tag of the word (spaCy required).
     pos2            Take the first two characters of the Part-of-Speech tag of the word (spaCy required).
-    bias            Adds "bias".
-    digit           Checks if the word contains just digits.
     ==============  =============================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
@@ -354,7 +353,6 @@ TextFeaturizer
           "features": [
             ["low", "title", "upper"],
             [
-              "bias",
               "low",
               "prefix5",
               "prefix2",
diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
index 3ef927835784..567d40cc1707 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
@@ -49,7 +49,6 @@ class TextFeaturizer(Featurizer):
         "features": [
             ["low", "title", "upper"],
             [
-                "bias",
                 "low",
                 "prefix5",
                 "prefix2",
@@ -75,7 +74,6 @@ class TextFeaturizer(Featurizer):
         "suffix1": lambda word: word.text[-1:],
         "pos": lambda word: word.pos_tag,
         "pos2": lambda word: word.pos_tag[:2],
-        "bias": lambda word: "bias",
         "upper": lambda word: word.text.isupper(),
         "digit": lambda word: word.text.isdigit(),
     }

From ff9bb32d79e484cd2cfd7cde0acfa9d0006e14f8 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 13:43:54 +0100
Subject: [PATCH 112/633] fix gathering indeces

---
 rasa/utils/tf_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index 8cde56f4c7c2..4d855f501216 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -527,7 +527,7 @@ def _sample_idxs(
 
         tiled = tf.tile(tf.expand_dims(x, 0), (batch_size, 1, 1))
 
-        return tf.gather(tiled, idxs, batch_dims=-1)
+        return tf.gather(tiled, idxs, batch_dims=1)
 
     def _get_bad_mask(
         self, labels: "tf.Tensor", target_labels: "tf.Tensor", idxs: "tf.Tensor"

From d5f018736e0f7b01016d3a3ae20ef96c181250d1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 13:45:31 +0100
Subject: [PATCH 113/633] add types

---
 .../embedding_intent_classifier.py            |  6 ++-
 rasa/utils/tf_layers.py                       | 50 +++++++++----------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b2679ecf4415..578f67674ddd 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1187,7 +1187,7 @@ def _build_all_b(self):
 
         return all_labels_embed, all_labels
 
-    def _intent_loss(self, a, b):
+    def _intent_loss(self, a: tf.Tensor, b: tf.Tensor):
         all_labels_embed, all_labels = self._build_all_b()
 
         a_embed = self._embed["text"](a)
@@ -1229,7 +1229,9 @@ def _entity_loss(
 
         return loss, f1
 
-    def _train_losses_scores(self, batch_in):
+    def _train_losses_scores(
+        self, batch_in: Tuple[np.ndarray]
+    ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
         tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index bf601b3edb7e..0ebf4fe101d1 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -20,7 +20,7 @@
 
 
 class SparseDropout(tf.keras.layers.Dropout):
-    def call(self, inputs, training):
+    def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
 
         to_retain_prob = tf.random.uniform(
             tf.shape(inputs.values), 0, 1, inputs.values.dtype
@@ -76,7 +76,7 @@ def __init__(
         droprate: float,
         reg_lambda: float,
         layer_name_suffix: Text,
-    ):
+    ) -> None:
         super(ReluFfn, self).__init__(name=f"ffnn_{layer_name_suffix}")
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
@@ -92,7 +92,7 @@ def __init__(
             )
             self._ffn_layers.append(tf.keras.layers.Dropout(rate=droprate))
 
-    def call(self, x, training):
+    def call(self, x: tf.Tensor, training: bool) -> tf.Tensor:
         for layer in self._ffn_layers:
             x = layer(x, training=training)
 
@@ -370,7 +370,6 @@ def call(self, x, pad_mask, training):
 
 
 class InputMask(tf.keras.layers.Layer):
-
     def build(self, input_shape):
         initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
@@ -423,7 +422,6 @@ def call(self, x, mask, training):
 
 
 class CRF(tf.keras.layers.Layer):
-
     def __init__(self, num_tags, reg_lambda, name=None):
         super().__init__(name=name)
 
@@ -443,7 +441,8 @@ def call(self, logits, sequence_lengths):
         )
         # set prediction index for padding to `0`
         mask = tf.sequence_mask(
-            sequence_lengths, maxlen=tf.shape(pred_ids)[1], dtype=pred_ids.dtype)
+            sequence_lengths, maxlen=tf.shape(pred_ids)[1], dtype=pred_ids.dtype
+        )
 
         return pred_ids * mask
 
@@ -455,7 +454,6 @@ def loss(self, logits, tag_indices, sequence_lengths):
 
 
 class DotProductLoss(tf.keras.layers.Layer):
-
     def __init__(
         self,
         num_neg: int,
@@ -465,7 +463,7 @@ def __init__(
         use_max_sim_neg: bool,
         neg_lambda: float,
         scale_loss: bool,
-        name=None
+        name=None,
     ):
         super().__init__(name=name)
         self.num_neg = num_neg
@@ -497,7 +495,8 @@ def rand_idxs():
             """Create random tensor of indices"""
             # (1, num_neg)
             return tf.expand_dims(
-                tf.random.shuffle(tf.range(total_candidates))[:self.num_neg], 0)
+                tf.random.shuffle(tf.range(total_candidates))[: self.num_neg], 0
+            )
 
         def cond(i, out):
             """Condition for while loop"""
@@ -509,7 +508,7 @@ def body(i, out):
                 # increment counter
                 i + 1,
                 # add random indices
-                tf.concat([out, rand_idxs()], 0)
+                tf.concat([out, rand_idxs()], 0),
             ]
 
         # first tensor already created
@@ -607,9 +606,7 @@ def _sample_negatives(
         )
 
     @staticmethod
-    def sim(
-        a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]
-    ) -> "tf.Tensor":
+    def sim(a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]) -> "tf.Tensor":
         """Calculate similarity between given tensors."""
 
         sim = tf.reduce_sum(a * b, -1)
@@ -635,15 +632,21 @@ def _train_sim(
         neg_inf = tf.constant(-1e9)
 
         sim_pos = self.sim(pos_inputs_embed, pos_labels_embed, mask)
-        sim_neg_il = self.sim(pos_inputs_embed, neg_labels_embed, mask) + neg_inf * labels_bad_negs
+        sim_neg_il = (
+            self.sim(pos_inputs_embed, neg_labels_embed, mask)
+            + neg_inf * labels_bad_negs
+        )
         sim_neg_ll = (
-                self.sim(pos_labels_embed, neg_labels_embed, mask) + neg_inf * labels_bad_negs
+            self.sim(pos_labels_embed, neg_labels_embed, mask)
+            + neg_inf * labels_bad_negs
         )
         sim_neg_ii = (
-                self.sim(pos_inputs_embed, neg_inputs_embed, mask) + neg_inf * inputs_bad_negs
+            self.sim(pos_inputs_embed, neg_inputs_embed, mask)
+            + neg_inf * inputs_bad_negs
         )
         sim_neg_li = (
-                self.sim(pos_labels_embed, neg_inputs_embed, mask) + neg_inf * inputs_bad_negs
+            self.sim(pos_labels_embed, neg_inputs_embed, mask)
+            + neg_inf * inputs_bad_negs
         )
 
         # output similarities between user input and bot actions
@@ -757,7 +760,8 @@ def _chosen_loss(self) -> Callable:
             return self._loss_softmax
         else:
             raise ValueError(
-                f"Wrong loss type '{self.loss_type}', " f"should be 'margin' or 'softmax'"
+                f"Wrong loss type '{self.loss_type}', "
+                f"should be 'margin' or 'softmax'"
             )
 
     def call(
@@ -779,7 +783,8 @@ def call(
             inputs_bad_negs,
             labels_bad_negs,
         ) = self._sample_negatives(
-            inputs_embed, labels_embed, labels, all_labels_embed, all_labels)
+            inputs_embed, labels_embed, labels, all_labels_embed, all_labels
+        )
 
         # calculate similarities
         sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li = self._train_sim(
@@ -795,12 +800,7 @@ def call(
         acc = self._calc_accuracy(sim_pos, sim_neg_il)
 
         loss = self._chosen_loss(
-            sim_pos,
-            sim_neg_il,
-            sim_neg_ll,
-            sim_neg_ii,
-            sim_neg_li,
-            mask,
+            sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li, mask
         )
 
         return loss, acc

From 31e9600cd866a23f05a4db872d9ec03bacca1709 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 16 Jan 2020 13:55:47 +0100
Subject: [PATCH 114/633] handle options

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8d0521c88783..1a2e6e723bc8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1257,6 +1257,7 @@ def __init__(
             self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
         else:
             self._input_mask = None
+            self._loss_mask = None
 
         if self._intent_classification:
             self._embed["text"] = tf_layers.Embed(
@@ -1278,6 +1279,8 @@ def __init__(
             self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
             self.eval_metrics["val_i_loss"] = tf.keras.metrics.Mean(name="val_i_loss")
             self.eval_metrics["val_i_acc"] = tf.keras.metrics.Mean(name="val_i_acc")
+        else:
+            self._loss_label = None
 
         if self._named_entity_recognition:
             self._embed["logits"] = tf_layers.Embed(

From 9d56582903d9b1290d092e4ea48653ad6cf88857 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 14:44:24 +0100
Subject: [PATCH 115/633] add types

---
 .../embedding_intent_classifier.py            | 125 +++++++++---------
 rasa/utils/tf_layers.py                       |  78 +++++++----
 rasa/utils/tf_models.py                       |  58 ++++----
 3 files changed, 143 insertions(+), 118 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 031f70989a9e..acba82f73034 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -214,9 +214,9 @@ def __init__(
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
         model: Optional[tf_models.RasaModel] = None,
-        predict_func: Optional[Callable] = None,
+        predict_func: Optional[tf.Function] = None,
         batch_tuple_sizes: Optional[Dict] = None,
-        attention_weights: Optional["tf.Tensor"] = None,
+        attention_weights: Optional[tf.Tensor] = None,
     ) -> None:
         """Declare instance variables with default values"""
 
@@ -252,7 +252,7 @@ def __init__(
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
-        training_data: "TrainingData", attribute: Text
+        training_data: TrainingData, attribute: Text
     ) -> Dict[Text, int]:
         """Create label_id dictionary"""
 
@@ -264,7 +264,7 @@ def _create_label_id_dict(
         }
 
     @staticmethod
-    def _create_tag_id_dict(training_data: "TrainingData") -> Dict[Text, int]:
+    def _create_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
         """Create label_id dictionary"""
 
         distinct_tag_ids = set(
@@ -284,15 +284,17 @@ def _create_tag_id_dict(training_data: "TrainingData") -> Dict[Text, int]:
 
     @staticmethod
     def _find_example_for_label(
-        label: Text, examples: List["Message"], attribute: Text
-    ) -> Optional["Message"]:
+        label: Text, examples: List[Message], attribute: Text
+    ) -> Optional[Message]:
         for ex in examples:
             if ex.get(attribute) == label:
                 return ex
         return None
 
     @staticmethod
-    def _find_example_for_tag(tag, examples, attribute):
+    def _find_example_for_tag(
+        tag: Text, examples: List[Message], attribute: Text
+    ) -> Optional[Message]:
         for ex in examples:
             for e in ex.get(attribute):
                 if e["entity"] == tag:
@@ -301,7 +303,7 @@ def _find_example_for_tag(tag, examples, attribute):
 
     @staticmethod
     def _check_labels_features_exist(
-        labels_example: List["Message"], attribute: Text
+        labels_example: List[Message], attribute: Text
     ) -> bool:
         """Check if all labels have features set"""
 
@@ -315,7 +317,7 @@ def _check_labels_features_exist(
 
     @staticmethod
     def _extract_and_add_features(
-        message: "Message", attribute: Text
+        message: Message, attribute: Text
     ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]:
         sparse_features = None
         dense_features = None
@@ -370,14 +372,14 @@ def _add_mask_to_session_data(
                 break
 
     @staticmethod
-    def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int:
+    def _get_num_of_features(session_data: SessionDataType, key: Text) -> int:
         num_features = 0
         for data in session_data[key]:
             if data.size > 0:
                 num_features += data[0].shape[-1]
         return num_features
 
-    def check_input_dimension_consistency(self, session_data: "SessionDataType"):
+    def check_input_dimension_consistency(self, session_data: SessionDataType):
         if self.component_config[SHARE_HIDDEN_LAYERS]:
             num_text_features = self._get_num_of_features(session_data, "text_features")
             num_intent_features = self._get_num_of_features(
@@ -392,7 +394,7 @@ def check_input_dimension_consistency(self, session_data: "SessionDataType"):
                 )
 
     def _extract_labels_precomputed_features(
-        self, label_examples: List["Message"], attribute: Text = INTENT_ATTRIBUTE
+        self, label_examples: List[Message], attribute: Text = INTENT_ATTRIBUTE
     ) -> List[np.ndarray]:
         """Collect precomputed encodings"""
 
@@ -411,26 +413,9 @@ def _extract_labels_precomputed_features(
 
         return [sparse_features, dense_features]
 
-    # @staticmethod
-    # def _compute_default_label_features(
-    #     labels_example: List["Message"],
-    # ) -> List[np.ndarray]:
-    #     """Compute one-hot representation for the labels"""
-    #
-    #     return [
-    #         np.array(
-    #             [
-    #                 scipy.sparse.coo_matrix(
-    #                     ([1], ([0], [idx])), shape=(1, len(labels_example))
-    #                 )
-    #                 for idx in range(len(labels_example))
-    #             ]
-    #         )
-    #     ]
-
     @staticmethod
     def _compute_default_label_features(
-        labels_example: List["Message"],
+        labels_example: List[Message],
     ) -> List[np.ndarray]:
         """Compute one-hot representation for the labels"""
 
@@ -445,10 +430,10 @@ def _compute_default_label_features(
 
     def _create_label_data(
         self,
-        training_data: "TrainingData",
+        training_data: TrainingData,
         label_id_dict: Dict[Text, int],
         attribute: Text,
-    ) -> "SessionDataType":
+    ) -> SessionDataType:
         """Create matrix with label_ids encoded in rows as bag of words.
 
         Find a training example for each label and get the encoded features
@@ -495,11 +480,11 @@ def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]
 
     def _create_session_data(
         self,
-        training_data: List["Message"],
+        training_data: List[Message],
         label_id_dict: Optional[Dict[Text, int]] = None,
         tag_id_dict: Optional[Dict[Text, int]] = None,
         label_attribute: Optional[Text] = None,
-    ) -> "SessionDataType":
+    ) -> SessionDataType:
         """Prepare data for training and create a SessionDataType object"""
 
         X_sparse = []
@@ -563,7 +548,7 @@ def _create_session_data(
         return session_data
 
     # train helpers
-    def preprocess_train_data(self, training_data: "TrainingData"):
+    def preprocess_train_data(self, training_data: TrainingData) -> SessionDataType:
         """Prepares data for training.
 
         Performs sanity checks on training data, extracts encodings for labels.
@@ -594,13 +579,13 @@ def preprocess_train_data(self, training_data: "TrainingData"):
         return session_data
 
     @staticmethod
-    def _check_enough_labels(session_data: "SessionDataType") -> bool:
+    def _check_enough_labels(session_data: SessionDataType) -> bool:
         return len(np.unique(session_data["label_ids"])) >= 2
 
     def train(
         self,
-        training_data: "TrainingData",
-        cfg: Optional["RasaNLUModelConfig"] = None,
+        training_data: TrainingData,
+        cfg: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
         """Train the embedding intent classifier on a data set."""
@@ -658,7 +643,7 @@ def train(
         # self.attention_weights = train_utils.extract_attention(self.attention_weights)
 
     # process helpers
-    def _predict(self, message: "Message"):
+    def _predict(self, message: Message) -> tf.Function:
         if self.model is None or self.predict_func is None:
             return
 
@@ -670,7 +655,9 @@ def _predict(self, message: "Message"):
 
         return self.predict_func(batch_in)
 
-    def _predict_label(self, out) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
+    def _predict_label(
+        self, out: Dict[Text, tf.Tensor]
+    ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
         """Predicts the intent of the provided message."""
 
         label = {"name": None, "confidence": 0.0}
@@ -708,7 +695,9 @@ def _predict_label(self, out) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
 
         return label, label_ranking
 
-    def _predict_entities(self, out, message: "Message") -> List[Dict]:
+    def _predict_entities(
+        self, out: Dict[Text, tf.Tensor], message: Message
+    ) -> List[Dict]:
         if self.model is None:
             logger.error(
                 "There is no trained tf.session: "
@@ -733,7 +722,7 @@ def _predict_entities(self, out, message: "Message") -> List[Dict]:
 
     @staticmethod
     def _convert_tags_to_entities(
-        text: str, tokens: List[Token], tags: List[Text]
+        text: Text, tokens: List[Token], tags: List[Text]
     ) -> List[Dict[Text, Any]]:
         entities = []
         last_tag = "O"
@@ -763,7 +752,7 @@ def _convert_tags_to_entities(
 
         return entities
 
-    def process(self, message: "Message", **kwargs: Any) -> None:
+    def process(self, message: Message, **kwargs: Any) -> None:
         """Return the most likely label and its similarity to the input."""
 
         out = self._predict(message)
@@ -949,7 +938,7 @@ def __init__(
         label_data: SessionDataType,
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
-    ):
+    ) -> None:
         super(DIET, self).__init__(name="DIET")
 
         # data
@@ -978,7 +967,7 @@ def __init__(
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _prepare_layers(self, session_data: SessionDataType):
+    def _prepare_layers(self, session_data: SessionDataType) -> None:
         self._sparse_dropout = tf_layers.SparseDropout(rate=self.config[DROPRATE])
 
         self._sparse_to_dense = {
@@ -1102,7 +1091,7 @@ def _prepare_layers(self, session_data: SessionDataType):
             self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
             self.eval_metrics["val_e_f1"] = tf.keras.metrics.Mean(name="val_e_f1")
 
-    def set_training_phase(self, training: bool):
+    def set_training_phase(self, training: bool) -> None:
         if training:
             self.training = tf.ones((), tf.bool)
         else:
@@ -1110,11 +1099,11 @@ def set_training_phase(self, training: bool):
 
     def _combine_sparse_dense_features(
         self,
-        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-        mask: "tf.Tensor",
+        features: List[Union[tf.Tensor, tf.SparseTensor]],
+        mask: tf.Tensor,
         name: Text,
         sparse_dropout: bool = False,
-    ) -> "tf.Tensor":
+    ) -> tf.Tensor:
 
         dense_features = []
 
@@ -1133,18 +1122,18 @@ def _combine_sparse_dense_features(
 
     def _create_bow(
         self,
-        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-        mask: "tf.Tensor",
+        features: List[Union[tf.Tensor, "tf.SparseTensor"]],
+        mask: tf.Tensor,
         name: Text,
-    ):
+    ) -> tf.Tensor:
 
         x = self._combine_sparse_dense_features(features, mask, name)
         return self._ffnn[name](tf.reduce_sum(x, 1), self.training)
 
     def _create_sequence(
         self,
-        features: List[Union["tf.Tensor", "tf.SparseTensor"]],
-        mask: "tf.Tensor",
+        features: List[Union[tf.Tensor, "tf.SparseTensor"]],
+        mask: tf.Tensor,
         name: Text,
         masked_lm_loss: bool = False,
     ):
@@ -1191,7 +1180,7 @@ def _build_all_b(self):
 
         return all_labels_embed, all_labels
 
-    def _intent_loss(self, a: tf.Tensor, b: tf.Tensor):
+    def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels_embed, all_labels = self._build_all_b()
 
         a_embed = self._embed["text"](a)
@@ -1200,8 +1189,8 @@ def _intent_loss(self, a: tf.Tensor, b: tf.Tensor):
         return self._loss_label(a_embed, b_embed, b, all_labels_embed, all_labels)
 
     def _entity_loss(
-        self, a: "tf.Tensor", c: "tf.Tensor", mask: "tf.Tensor", sequence_lengths
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        self, a: tf.Tensor, c: tf.Tensor, mask: tf.Tensor, sequence_lengths
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
 
         # remove cls token
         sequence_lengths = sequence_lengths - 1
@@ -1233,7 +1222,9 @@ def _entity_loss(
 
         return loss, f1
 
-    def _train_losses_scores(self, batch_in):
+    def _train_losses_scores(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
         tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1279,7 +1270,9 @@ def _train_losses_scores(self, batch_in):
 
         return losses, scores
 
-    def train_on_batch(self, batch_in):
+    def train_on_batch(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> None:
         with tf.GradientTape() as tape:
             losses, scores = self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
@@ -1295,7 +1288,7 @@ def train_on_batch(self, batch_in):
         for k, v in scores.items():
             self.train_metrics[k].update_state(v)
 
-    def train_dataset(self, batch_size):
+    def train_dataset(self, batch_size: int) -> tf.data.Dataset:
         return train_utils.create_tf_dataset(
             self.session_data,
             batch_size,
@@ -1304,7 +1297,7 @@ def train_dataset(self, batch_size):
             shuffle=True,
         )
 
-    def eval(self, batch_in):
+    def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
         losses, scores = self._train_losses_scores(batch_in)
         total_loss = tf.math.add_n(list(losses.values())) + self.losses
 
@@ -1314,19 +1307,21 @@ def eval(self, batch_in):
         for k, v in scores.items():
             self.eval_metrics[f"val_{k}"].update_state(v)
 
-    def eval_dataset(self, batch_size):
+    def eval_dataset(self, batch_size: int) -> tf.data.Dataset:
         if self.eval_session_data is not None:
             return train_utils.create_tf_dataset(
                 self.eval_session_data, batch_size, label_key="label_ids"
             )
 
-    def build_for_predict(self):
+    def build_for_predict(self) -> None:
         self.batch_tuple_sizes = train_utils.batch_tuple_sizes(self.session_data)
 
         all_labels_embed, _ = self._build_all_b()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
-    def predict(self, batch_in):
+    def predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> Dict[Text, tf.Tensor]:
         tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1371,7 +1366,7 @@ def predict(self, batch_in):
 
         return out
 
-    def predict_dataset(self):
+    def predict_dataset(self) -> tf.data.Dataset:
         return train_utils.create_tf_dataset(
             self.session_data, 1, label_key="label_ids"
         )
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index cae204f8b6ca..d62a6de579f9 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -38,14 +38,14 @@ class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor"""
 
     # noinspection PyPep8Naming
-    def __init__(self, reg_lambda: float, **kwargs):
+    def __init__(self, reg_lambda: float, **kwargs) -> None:
         l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
 
         super(DenseForSparse, self).__init__(
             kernel_regularizer=l1_regularizer, **kwargs
         )
 
-    def call(self, inputs):
+    def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         if not isinstance(inputs, tf.SparseTensor):
             raise ValueError("Input tensor should be sparse.")
 
@@ -108,7 +108,7 @@ def __init__(
         reg_lambda: float,
         layer_name_suffix: Text,
         similarity_type: Optional[Text] = None,
-    ):
+    ) -> None:
         super(Embed, self).__init__(name=f"embed_{layer_name_suffix}")
 
         self.similarity_type = similarity_type
@@ -126,7 +126,7 @@ def __init__(
             name=f"embed_layer_{layer_name_suffix}",
         )
 
-    def call(self, x):
+    def call(self, x: tf.Tensor) -> tf.Tensor:
         x = self._dense(x)
         if self.similarity_type == "cosine":
             x = tf.nn.l2_normalize(x, -1)
@@ -178,7 +178,7 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
 
         return output, attention_weights
 
-    def __init__(self, d_model, num_heads, reg_lambda):
+    def __init__(self, d_model, num_heads: int, reg_lambda: float) -> None:
         super(MultiHeadAttention, self).__init__()
         self.num_heads = num_heads
         self.d_model = d_model
@@ -199,7 +199,7 @@ def __init__(self, d_model, num_heads, reg_lambda):
         )
         self._dense = tf.keras.layers.Dense(d_model, kernel_regularizer=l2_regularizer)
 
-    def _split_heads(self, x):
+    def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
         """Split the last dimension into (num_heads, depth).
 
         Transpose the result such that the shape is
@@ -209,7 +209,7 @@ def _split_heads(self, x):
         x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth))
         return tf.transpose(x, perm=[0, 2, 1, 3])
 
-    def _combine_heads(self, x):
+    def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
         """Inverse of split_heads.
 
         Args:
@@ -226,7 +226,13 @@ def _combine_heads(self, x):
             x, (tf.shape(x)[0], -1, self.d_model)
         )  # (batch_size, seq_len_q, d_model)
 
-    def call(self, v, k, q, pad_mask=None):
+    def call(
+        self,
+        v: tf.Tensor,
+        k: tf.Tensor,
+        q: tf.Tensor,
+        pad_mask: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
         q = self._wq(q)  # (batch_size, seq_len_q, d_model)
         k = self._wk(k)  # (batch_size, seq_len_k, d_model)
         v = self._wv(v)  # (batch_size, seq_len_v, d_model)
@@ -248,7 +254,14 @@ def call(self, v, k, q, pad_mask=None):
 
 
 class TransformerEncoderLayer(tf.keras.layers.Layer):
-    def __init__(self, d_model, num_heads, dff, reg_lambda, rate=0.1):
+    def __init__(
+        self,
+        d_model: tf.Tensor,
+        num_heads: int,
+        dff: tf.Tensor,
+        reg_lambda: float,
+        rate: float = 0.1,
+    ) -> None:
         super(TransformerEncoderLayer, self).__init__()
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
@@ -268,7 +281,7 @@ def __init__(self, d_model, num_heads, dff, reg_lambda, rate=0.1):
             tf.keras.layers.Dropout(rate),
         ]
 
-    def call(self, x, pad_mask, training):
+    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: bool) -> tf.Tensor:
 
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
         attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
@@ -285,7 +298,7 @@ def call(self, x, pad_mask, training):
 
 class TransformerEncoder(tf.keras.layers.Layer):
     @staticmethod
-    def _look_ahead_pad_mask(seq_len):
+    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
         pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
         return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
 
@@ -295,7 +308,7 @@ def _get_angles(pos, i, d_model):
         return pos * angle_rates
 
     @classmethod
-    def _positional_encoding(cls, position, d_model):
+    def _positional_encoding(cls, position, d_model) -> tf.Tensor:
         angle_rads = cls._get_angles(
             np.arange(position)[:, np.newaxis],
             np.arange(d_model)[np.newaxis, :],
@@ -314,16 +327,16 @@ def _positional_encoding(cls, position, d_model):
 
     def __init__(
         self,
-        num_layers,
+        num_layers: int,
         d_model,
-        num_heads,
+        num_heads: int,
         dff,
-        max_seq_length,
-        reg_lambda,
-        rate=0.1,
-        unidirectional=False,
-        name=None,
-    ):
+        max_seq_length: int,
+        reg_lambda: float,
+        rate: float = 0.1,
+        unidirectional: bool = False,
+        name: Optional[Text] = None,
+    ) -> None:
         super(TransformerEncoder, self).__init__(name=name)
 
         self.d_model = d_model
@@ -344,7 +357,9 @@ def __init__(
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
-    def call(self, x, pad_mask, training):
+    def call(
+        self, x: "tf.Tensor", pad_mask: "tf.Tensor", training: bool
+    ) -> "tf.Tensor":
 
         # adding embedding and position encoding.
         x = self._embedding(x)  # (batch_size, seq_len, d_model)
@@ -370,7 +385,7 @@ def call(self, x, pad_mask, training):
 
 
 class InputMask(tf.keras.layers.Layer):
-    def build(self, input_shape):
+    def build(self, input_shape: List[int]) -> None:
         initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]),
@@ -380,7 +395,9 @@ def build(self, input_shape):
         )
         self.built = True
 
-    def call(self, x, mask, training):
+    def call(
+        self, x: "tf.Tensor", mask: "tf.Tensor", training: bool
+    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Randomly mask input sequences."""
 
         # do not substitute with cls token
@@ -422,7 +439,7 @@ def call(self, x, mask, training):
 
 
 class CRF(tf.keras.layers.Layer):
-    def __init__(self, num_tags, reg_lambda, name=None):
+    def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
 
         initializer = tf.keras.initializers.GlorotUniform()
@@ -435,7 +452,7 @@ def __init__(self, num_tags, reg_lambda, name=None):
             name="transitions",
         )
 
-    def call(self, logits, sequence_lengths):
+    def call(self, logits: "tf.Tensor", sequence_lengths: "tf.Tensor") -> "tf.TEnsor":
         pred_ids, _ = tfa.text.crf.crf_decode(
             logits, self.transition_params, sequence_lengths
         )
@@ -446,7 +463,12 @@ def call(self, logits, sequence_lengths):
 
         return pred_ids * mask
 
-    def loss(self, logits, tag_indices, sequence_lengths):
+    def loss(
+        self,
+        logits: "tf.Tensor",
+        tag_indices: "tf.Tensor",
+        sequence_lengths: "tf.Tensor",
+    ) -> "tf.Tensor":
         log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
             logits, tag_indices, sequence_lengths, self.transition_params
         )
@@ -463,8 +485,8 @@ def __init__(
         use_max_sim_neg: bool,
         neg_lambda: float,
         scale_loss: bool,
-        name=None,
-    ):
+        name: Text = None,
+    ) -> None:
         super().__init__(name=name)
         self.num_neg = num_neg
         self.loss_type = loss_type
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 3d7bd88febf3..460ed4cda96d 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -21,13 +21,9 @@
 
 
 class RasaModel(tf.keras.models.Model):
-
-    def compile(self):
-        raise NotImplemented
-
     @staticmethod
     def _update_postfix_dict(
-            postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
+        postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
     ) -> Dict[Text, Text]:
         for name, value in metrics.loss.items():
             postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
@@ -35,14 +31,15 @@ def _update_postfix_dict(
             postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
         return postfix_dict
 
-    def fit(self,
-            epochs: int,
-            batch_size: Union[List[int], int],
-            evaluate_on_num_examples: int,
-            evaluate_every_num_epochs: int,
-            silent: bool = False,
-            eager: bool = False,
-            output_file: Optional[Text] = None,
+    def fit(
+        self,
+        epochs: int,
+        batch_size: Union[List[int], int],
+        evaluate_on_num_examples: int,
+        evaluate_every_num_epochs: int,
+        silent: bool = False,
+        eager: bool = False,
+        output_file: Optional[Text] = None,
     ) -> None:
         """Train tf graph"""
 
@@ -63,7 +60,8 @@ def fit(self,
             # allows increasing batch size
             train_dataset_func = tf.function(self.train_dataset)
             train_on_batch_func = tf.function(
-                self.train_on_batch, input_signature=[train_dataset_func(tf_batch_size).element_spec]
+                self.train_on_batch,
+                input_signature=[train_dataset_func(tf_batch_size).element_spec],
             )
 
         if evaluate_on_num_examples > 0:
@@ -73,7 +71,8 @@ def fit(self,
             else:
                 eval_dataset_func = tf.function(self.eval_dataset)
                 eval_func = tf.function(
-                    self.eval, input_signature=[eval_dataset_func(tf_batch_size).element_spec]
+                    self.eval,
+                    input_signature=[eval_dataset_func(tf_batch_size).element_spec],
                 )
         else:
             eval_dataset_func = None
@@ -97,7 +96,9 @@ def fit(self,
             # exit()
 
             # Get the metric results
-            postfix_dict = {k: v.result().numpy() for k, v in self.train_metrics.items()}
+            postfix_dict = {
+                k: v.result().numpy() for k, v in self.train_metrics.items()
+            }
 
             if evaluate_on_num_examples > 0:
                 if (
@@ -125,26 +126,33 @@ def fit(self,
         if not disable:
             logger.info("Finished training.")
 
-    def evaluate(self):
+    def compile(self) -> None:
+        raise NotImplemented
+
+    def evaluate(self) -> None:
         pass
 
-    def predict(self):
+    def predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> Dict[Text, tf.Tensor]:
         pass
 
-    def train_on_batch(self, batch_in):
+    def train_on_batch(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> None:
         raise NotImplementedError
 
-    def test_on_batch(self):
+    def test_on_batch(self) -> None:
         raise NotImplemented
 
-    def predict_on_batch(self):
+    def predict_on_batch(self) -> None:
         raise NotImplemented
 
-    def fit_generator(self):
+    def fit_generator(self) -> None:
         raise NotImplemented
 
-    def evaluate_generator(self):
+    def evaluate_generator(self) -> None:
         raise NotImplemented
 
-    def predict_generator(self):
-        raise NotImplemented
\ No newline at end of file
+    def predict_generator(self) -> None:
+        raise NotImplemented

From 5fe4b4844ca4ce8eeeb7ca9a9e8282cc2ecc3c2a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 16:47:36 +0100
Subject: [PATCH 116/633] remove session data from DIET

---
 .../embedding_intent_classifier.py            | 105 +++++++++++-------
 rasa/utils/tf_models.py                       |  34 +++---
 rasa/utils/train_utils.py                     |  19 ++--
 3 files changed, 90 insertions(+), 68 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index acba82f73034..9ddacc5ec404 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -18,7 +18,7 @@
 from rasa.utils import train_utils
 from rasa.utils import tf_layers
 from rasa.utils import tf_models
-from rasa.utils.train_utils import SessionDataType
+from rasa.utils.train_utils import SessionDataType, SessionDataSignature
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -33,11 +33,10 @@
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from rasa.nlu.config import RasaNLUModelConfig
-    from rasa.nlu.training_data import TrainingData
-    from rasa.nlu.model import Metadata
-    from rasa.nlu.training_data import Message
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import TrainingData
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message
 
 
 shapes, types = None, None
@@ -214,7 +213,7 @@ def __init__(
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
         model: Optional[tf_models.RasaModel] = None,
-        predict_func: Optional[tf.Function] = None,
+        predict_func: Optional[Callable] = None,
         batch_tuple_sizes: Optional[Dict] = None,
         attention_weights: Optional[tf.Tensor] = None,
     ) -> None:
@@ -602,7 +601,7 @@ def train(
 
             if not possible_to_train:
                 logger.error(
-                    "Can not train a classifier. "
+                    "Can not train intent classifier. "
                     "Need at least 2 different classes. "
                     "Skipping training of classifier."
                 )
@@ -622,9 +621,10 @@ def train(
         # set random seed
         tf.random.set_seed(self.component_config[RANDOM_SEED])
 
+        session_data_signature = self.create_signature(session_data)
+
         self.model = DIET(
-            session_data,
-            eval_session_data,
+            session_data_signature,
             self._label_data,
             self.inverted_tag_dict,
             self.component_config,
@@ -633,6 +633,8 @@ def train(
         self.model.fit(
             self.component_config[EPOCHS],
             self.component_config[BATCH_SIZES],
+            session_data,
+            eval_session_data,
             self.component_config[EVAL_NUM_EXAMPLES],
             self.component_config[EVAL_NUM_EPOCHS],
         )
@@ -642,8 +644,18 @@ def train(
 
         # self.attention_weights = train_utils.extract_attention(self.attention_weights)
 
+    @staticmethod
+    def create_signature(session_data: SessionDataType):
+        return {
+            key: [
+                (True if isinstance(v[0], scipy.sparse.spmatrix) else False, v[0].shape)
+                for v in values
+            ]
+            for key, values in session_data.items()
+        }
+
     # process helpers
-    def _predict(self, message: Message) -> tf.Function:
+    def _predict(self, message: Message) -> Optional[Callable]:
         if self.model is None or self.predict_func is None:
             return
 
@@ -903,16 +915,21 @@ def load(
 
 class DIET(tf_models.RasaModel):
     @staticmethod
-    def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
+    def _create_sparse_dense_layer(
+        data_signature: List[Tuple[bool, List[int]]],
+        name: Text,
+        reg_lambda: float,
+        dense_dim: int,
+    ) -> Optional[tf_layers.DenseForSparse]:
 
         sparse = False
-        for v in values:
-            if isinstance(v[0], scipy.sparse.spmatrix):
-                sparse = True
+        for is_sparse, shape in data_signature:
+            if is_sparse:
+                sparse = is_sparse
             else:
                 # if dense features are present
                 # use the feature dimension of the dense features
-                dense_dim = v[0].shape[-1]
+                dense_dim = shape[-1]
 
         if sparse:
             return tf_layers.DenseForSparse(
@@ -920,21 +937,20 @@ def _create_sparse_dense_layer(values, name, reg_lambda, dense_dim):
             )
 
     @staticmethod
-    def _input_dim(values, dense_dim):
+    def _input_dim(data_signature: List[Tuple[bool, List[int]]], dense_dim: int) -> int:
 
-        for v in values:
-            if not isinstance(v[0], scipy.sparse.spmatrix):
+        for is_sparse, shape in data_signature:
+            if not is_sparse:
                 # if dense features are present
                 # use the feature dimension of the dense features
-                dense_dim = v[0].shape[-1]
+                dense_dim = shape[-1]
                 break
 
-        return dense_dim * len(values)
+        return dense_dim * len(data_signature)
 
     def __init__(
         self,
-        session_data: SessionDataType,
-        eval_session_data: Optional[SessionDataType],
+        session_data_signature: SessionDataSignature,
         label_data: SessionDataType,
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
@@ -942,16 +958,17 @@ def __init__(
         super(DIET, self).__init__(name="DIET")
 
         # data
-        self.session_data = session_data
-        self.eval_session_data = eval_session_data
+        self.session_data_signature = session_data_signature
         label_batch = train_utils.prepare_batch(label_data)
-        self.tf_label_data = train_utils.batch_to_session_data(label_batch, label_data)
+        self.tf_label_data = train_utils.batch_to_session_data(
+            label_batch, EmbeddingIntentClassifier.create_signature(label_data)
+        )
         self._num_tags = len(inverted_tag_dict)
 
         self.config = config
 
         # tf objects
-        self._prepare_layers(session_data)
+        self._prepare_layers()
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -967,18 +984,18 @@ def __init__(
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _prepare_layers(self, session_data: SessionDataType) -> None:
+    def _prepare_layers(self) -> None:
         self._sparse_dropout = tf_layers.SparseDropout(rate=self.config[DROPRATE])
 
         self._sparse_to_dense = {
             "text": self._create_sparse_dense_layer(
-                session_data["text_features"],
+                self.session_data_signature["text_features"],
                 "text",
                 self.config[C2],
                 self.config[DENSE_DIM]["text"],
             ),
             "label": self._create_sparse_dense_layer(
-                session_data["label_features"],
+                self.session_data_signature["label_features"],
                 "label",
                 self.config[C2],
                 self.config[DENSE_DIM]["label"],
@@ -1225,7 +1242,9 @@ def _entity_loss(
     def _train_losses_scores(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
-        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
+        tf_batch_data = train_utils.batch_to_session_data(
+            batch_in, self.session_data_signature
+        )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1288,9 +1307,11 @@ def train_on_batch(
         for k, v in scores.items():
             self.train_metrics[k].update_state(v)
 
-    def train_dataset(self, batch_size: int) -> tf.data.Dataset:
+    def train_dataset(
+        self, batch_size: int, session_data: SessionDataType
+    ) -> tf.data.Dataset:
         return train_utils.create_tf_dataset(
-            self.session_data,
+            session_data,
             batch_size,
             label_key="label_ids",
             batch_strategy=self.config[BATCH_STRATEGY],
@@ -1307,10 +1328,12 @@ def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
         for k, v in scores.items():
             self.eval_metrics[f"val_{k}"].update_state(v)
 
-    def eval_dataset(self, batch_size: int) -> tf.data.Dataset:
-        if self.eval_session_data is not None:
+    def eval_dataset(
+        self, batch_size: int, session_data: Optional[SessionDataType]
+    ) -> tf.data.Dataset:
+        if session_data is not None:
             return train_utils.create_tf_dataset(
-                self.eval_session_data, batch_size, label_key="label_ids"
+                session_data, batch_size, label_key="label_ids"
             )
 
     def build_for_predict(self) -> None:
@@ -1322,7 +1345,9 @@ def build_for_predict(self) -> None:
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.session_data)
+        tf_batch_data = train_utils.batch_to_session_data(
+            batch_in, self.session_data_signature
+        )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1366,7 +1391,5 @@ def predict(
 
         return out
 
-    def predict_dataset(self) -> tf.data.Dataset:
-        return train_utils.create_tf_dataset(
-            self.session_data, 1, label_key="label_ids"
-        )
+    def predict_dataset(self, session_data: SessionDataType) -> tf.data.Dataset:
+        return train_utils.create_tf_dataset(session_data, 1, label_key="label_ids")
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 460ed4cda96d..8f230bc43a3d 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -1,22 +1,13 @@
-import typing
+import numpy as np
 import logging
-from typing import (
-    List,
-    Optional,
-    Text,
-    Dict,
-    Tuple,
-    Union,
-    Generator,
-    Callable,
-    Any,
-    NamedTuple,
-)
+from typing import List, Optional, Text, Dict, Tuple, Union
 from tqdm import tqdm
 from rasa.utils import train_utils
 from rasa.utils.common import is_logging_disabled
 import tensorflow as tf
 
+from rasa.utils.train_utils import SessionDataType
+
 logger = logging.getLogger(__name__)
 
 
@@ -35,11 +26,12 @@ def fit(
         self,
         epochs: int,
         batch_size: Union[List[int], int],
+        session_data: SessionDataType,
+        eval_session_data: Optional[SessionDataType],
         evaluate_on_num_examples: int,
         evaluate_every_num_epochs: int,
         silent: bool = False,
-        eager: bool = False,
-        output_file: Optional[Text] = None,
+        eager: bool = True,
     ) -> None:
         """Train tf graph"""
 
@@ -61,7 +53,9 @@ def fit(
             train_dataset_func = tf.function(self.train_dataset)
             train_on_batch_func = tf.function(
                 self.train_on_batch,
-                input_signature=[train_dataset_func(tf_batch_size).element_spec],
+                input_signature=[
+                    train_dataset_func(tf_batch_size, session_data).element_spec
+                ],
             )
 
         if evaluate_on_num_examples > 0:
@@ -72,7 +66,9 @@ def fit(
                 eval_dataset_func = tf.function(self.eval_dataset)
                 eval_func = tf.function(
                     self.eval,
-                    input_signature=[eval_dataset_func(tf_batch_size).element_spec],
+                    input_signature=[
+                        eval_dataset_func(tf_batch_size, eval_session_data).element_spec
+                    ],
                 )
         else:
             eval_dataset_func = None
@@ -89,7 +85,7 @@ def fit(
 
             # Train on batches
             self.set_training_phase(True)
-            for batch_in in train_dataset_func(ep_batch_size):
+            for batch_in in train_dataset_func(ep_batch_size, session_data):
                 train_on_batch_func(batch_in)
 
             # print(self.metrics)
@@ -112,7 +108,7 @@ def fit(
 
                     # Eval on batches
                     self.set_training_phase(False)
-                    for batch_in in eval_dataset_func(ep_batch_size):
+                    for batch_in in eval_dataset_func(ep_batch_size, eval_session_data):
                         eval_func(batch_in)
 
                 # Get the metric results
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index f5093105f857..da223c9789c3 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -36,6 +36,10 @@
 
 # type for all tf session related data
 SessionDataType = Dict[Text, List[np.ndarray]]
+# signature for all session related data
+# (boolean indicates whether data are sparse or not)
+# (list values represent the shape)
+SessionDataSignature = Dict[Text, List[Tuple[bool, List[int]]]]
 
 
 # namedtuple for training metrics
@@ -419,7 +423,8 @@ def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
 
 
 def batch_to_session_data(
-    batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionDataType
+    batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
+    session_data_signature: SessionDataSignature,
 ) -> Dict[Text, List[tf.Tensor]]:
     """Convert input batch tensors into batch data format.
 
@@ -432,15 +437,15 @@ def batch_to_session_data(
     batch_data = defaultdict(list)
 
     idx = 0
-    for k, values in session_data.items():
-        for v in values:
-            if isinstance(v[0], scipy.sparse.spmatrix):
+    for k, signature in session_data_signature.items():
+        for is_sparse, shape in signature:
+            if is_sparse:
                 # explicitly substitute last dimension in shape with known static value
                 batch_data[k].append(
                     tf.SparseTensor(
                         batch[idx],
                         batch[idx + 1],
-                        [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]],
+                        [batch[idx + 2][0], batch[idx + 2][1], shape[-1]],
                     )
                 )
                 idx += 3
@@ -451,9 +456,7 @@ def batch_to_session_data(
     return batch_data
 
 
-def batch_tuple_sizes(
-    session_data: SessionDataType
-) -> Dict[Text, int]:
+def batch_tuple_sizes(session_data: SessionDataType) -> Dict[Text, int]:
 
     # save the amount of placeholders attributed to session data keys
     tuple_sizes = defaultdict(int)

From fbcf599439bf8eede498bd9a58d18538734a192d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 17:07:25 +0100
Subject: [PATCH 117/633] Fix saving and loading

---
 .../embedding_intent_classifier.py            | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9ddacc5ec404..c526ab1ec7f0 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -248,6 +248,8 @@ def __init__(
 
         self._tf_config = train_utils.load_tf_config(self.component_config)
 
+        self.session_data_example = None
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -617,6 +619,11 @@ def train(
         else:
             eval_session_data = None
 
+        # keep one example for persisting and loading
+        self.session_data_example = {
+            k: [v[:1] for v in vs] for k, vs in session_data.items()
+        }
+
         # TODO set it in the model
         # set random seed
         tf.random.set_seed(self.component_config[RANDOM_SEED])
@@ -661,8 +668,7 @@ def _predict(self, message: Message) -> Optional[Callable]:
 
         # create session data from message and convert it into a batch of 1
         session_data = self._create_session_data([message])
-        self.model.session_data = session_data
-        predict_dataset = self.model.predict_dataset()
+        predict_dataset = self.model.predict_dataset(session_data)
         batch_in = next(iter(predict_dataset))
 
         return self.predict_func(batch_in)
@@ -802,14 +808,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
         self.model.save_weights(tf_model_file, save_format="tf")
 
-        dummy_session_data = {
-            k: [v[:1] for v in vs] for k, vs in self.model.session_data.items()
-        }
-
         with open(
-            os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "wb"
+            os.path.join(model_dir, file_name + ".session_data_example.pkl"), "wb"
         ) as f:
-            pickle.dump(dummy_session_data, f)
+            pickle.dump(self.session_data_example, f)
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
             pickle.dump(self._label_data, f)
@@ -857,9 +859,9 @@ def load(
             _tf_config = pickle.load(f)
 
         with open(
-            os.path.join(model_dir, file_name + ".dummy_session_data.pkl"), "rb"
+            os.path.join(model_dir, file_name + ".session_data_example.pkl"), "rb"
         ) as f:
-            dummy_session_data = pickle.load(f)
+            session_data_example = pickle.load(f)
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
             label_data = pickle.load(f)
@@ -883,19 +885,25 @@ def load(
             elif meta[LOSS_TYPE] == "margin":
                 meta[SIMILARITY_TYPE] = "cosine"
 
-        model = DIET(dummy_session_data, None, label_data, inv_tag_dict, meta)
+        model = DIET(
+            EmbeddingIntentClassifier.create_signature(session_data_example),
+            label_data,
+            inv_tag_dict,
+            meta,
+        )
 
         logger.debug("Loading the model ...")
-        model.fit(1, 1, 0, 0, silent=True, eager=True)
+        model.fit(1, 1, session_data_example, None, 0, 0, silent=True, eager=True)
         model.load_weights(tf_model_file)
 
         # build the graph for prediction
         model.set_training_phase(False)
-        model.session_data = {
-            k: vs for k, vs in model.session_data.items() if "text" in k
-        }
-        model.build_for_predict()
-        predict_dataset = model.predict_dataset()
+        session_data = {k: vs for k, vs in session_data_example.items() if "text" in k}
+        model.session_data_signature = EmbeddingIntentClassifier.create_signature(
+            session_data
+        )
+        model.build_for_predict(session_data)
+        predict_dataset = model.predict_dataset(session_data)
         predict_func = tf.function(
             model.predict, input_signature=[predict_dataset.element_spec]
         )
@@ -1336,8 +1344,8 @@ def eval_dataset(
                 session_data, batch_size, label_key="label_ids"
             )
 
-    def build_for_predict(self) -> None:
-        self.batch_tuple_sizes = train_utils.batch_tuple_sizes(self.session_data)
+    def build_for_predict(self, session_data: SessionDataType) -> None:
+        self.batch_tuple_sizes = train_utils.batch_tuple_sizes(session_data)
 
         all_labels_embed, _ = self._build_all_b()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())

From bb90259c3836121956f930c9618b5932b9976959 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 16 Jan 2020 17:14:58 +0100
Subject: [PATCH 118/633] correct types

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index c526ab1ec7f0..4d89383739fd 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -662,7 +662,7 @@ def create_signature(session_data: SessionDataType):
         }
 
     # process helpers
-    def _predict(self, message: Message) -> Optional[Callable]:
+    def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
         if self.model is None or self.predict_func is None:
             return
 

From 76775e0a92669fcd51b38bd57a37bb32ce89b2e0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 13:46:07 +0100
Subject: [PATCH 119/633] fix non eager mode

---
 .../embedding_intent_classifier.py            |  4 ++--
 rasa/utils/tf_models.py                       | 24 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 4d89383739fd..13e14b58dbff 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1316,7 +1316,7 @@ def train_on_batch(
             self.train_metrics[k].update_state(v)
 
     def train_dataset(
-        self, batch_size: int, session_data: SessionDataType
+        self, batch_size: "tf.Tensor", session_data: SessionDataType
     ) -> tf.data.Dataset:
         return train_utils.create_tf_dataset(
             session_data,
@@ -1337,7 +1337,7 @@ def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
             self.eval_metrics[f"val_{k}"].update_state(v)
 
     def eval_dataset(
-        self, batch_size: int, session_data: Optional[SessionDataType]
+        self, batch_size: "tf.Tensor", session_data: Optional[SessionDataType]
     ) -> tf.data.Dataset:
         if session_data is not None:
             return train_utils.create_tf_dataset(
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 8f230bc43a3d..b302c208876a 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -44,31 +44,31 @@ def fit(
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         tf_batch_size = tf.ones((), tf.int32)
+
         if eager:
             # allows increasing batch size
-            train_dataset_func = self.train_dataset
+            train_dataset_func = lambda x: self.train_dataset(x, session_data)
             train_on_batch_func = self.train_on_batch
         else:
             # allows increasing batch size
-            train_dataset_func = tf.function(self.train_dataset)
+            train_dataset_func = tf.function(
+                func=lambda x: self.train_dataset(x, session_data)
+            )
             train_on_batch_func = tf.function(
                 self.train_on_batch,
-                input_signature=[
-                    train_dataset_func(tf_batch_size, session_data).element_spec
-                ],
+                input_signature=[train_dataset_func(1).element_spec],
             )
 
         if evaluate_on_num_examples > 0:
             if eager:
-                eval_dataset_func = self.eval_dataset
+                eval_dataset_func = lambda x: self.eval_dataset(x, eval_session_data)
                 eval_func = self.eval
             else:
-                eval_dataset_func = tf.function(self.eval_dataset)
+                eval_dataset_func = tf.function(
+                    func=lambda x: self.eval_dataset(x, eval_session_data)
+                )
                 eval_func = tf.function(
-                    self.eval,
-                    input_signature=[
-                        eval_dataset_func(tf_batch_size, eval_session_data).element_spec
-                    ],
+                    self.eval, input_signature=[eval_dataset_func(1).element_spec]
                 )
         else:
             eval_dataset_func = None
@@ -85,7 +85,7 @@ def fit(
 
             # Train on batches
             self.set_training_phase(True)
-            for batch_in in train_dataset_func(ep_batch_size, session_data):
+            for batch_in in train_dataset_func(ep_batch_size):
                 train_on_batch_func(batch_in)
 
             # print(self.metrics)

From 03e4f6805c93f301f751f23ffcee64caf9f3948c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 14:37:13 +0100
Subject: [PATCH 120/633] clean up

---
 .../pretrained_lm_featurizer.py               | 291 ------------------
 rasa/nlu/test.py                              |  56 ----
 rasa/nlu/tokenizers/convert_tokenizer.py      | 124 --------
 3 files changed, 471 deletions(-)
 delete mode 100644 rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
 delete mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py

diff --git a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
deleted file mode 100644
index 32f3a2d88a12..000000000000
--- a/rasa/nlu/featurizers/dense_featurizer/pretrained_lm_featurizer.py
+++ /dev/null
@@ -1,291 +0,0 @@
-from typing import Any, Dict, Optional, Text
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.featurzier import Featurizer
-from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    MESSAGE_TEXT_ATTRIBUTE,
-    MESSAGE_VECTOR_DENSE_FEATURE_NAMES,
-    SPACY_FEATURIZABLE_ATTRIBUTES,
-)
-import torch
-import re
-from transformers import *
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-model_dictionary = {
-    "bert-base-uncased": BertModel,
-    "openai-gpt": OpenAIGPTModel,
-    # "gpt2": GPT2Model,
-    # "transfo-xl-wt103": TransfoXLModel,
-    # "xlnet-base-cased": XLNetModel,
-    # "xlm-mlm-enfr-1024": XLMModel,
-    # "distilbert-base-uncased": DistilBertModel,
-    # "roberta-base": RobertaModel,
-}
-
-tokenizer_dictionary = {
-    "bert-base-uncased": BertTokenizer,
-    "openai-gpt": OpenAIGPTTokenizer,
-    # "gpt2": GPT2Tokenizer,
-    # "transfo-xl-wt103": TransfoXLTokenizer,
-    # "xlnet-base-cased": XLNetTokenizer,
-    # "xlm-mlm-enfr-1024": XLMTokenizer,
-    # "distilbert-base-uncased": DistilBertTokenizer,
-    # "roberta-base": RobertaTokenizer,
-}
-
-special_tokens_present = {
-    "bert-base-uncased": True,
-    "openai-gpt": False,
-    # "gpt2": False,
-    # "transfo-xl-wt103": False,
-    # "xlnet-base-cased": True,
-    # "xlm-mlm-enfr-1024": True,
-    # "distilbert-base-uncased": True,
-    # "roberta-base": True,
-}
-
-
-class PreTrainedLMFeaturizer(Featurizer):
-
-    provides = [
-        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]
-        for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    defaults = {
-        # model key identified by HF Transformers
-        "model_key": "bert-base-uncased"
-    }
-
-    def _load_transformers_params(self):
-
-        self.lm_key = self.component_config["lm_key"]
-
-        if self.lm_key not in tokenizer_dictionary:
-            logger.error("{} not a valid model key name".format(self.lm_key))
-            raise
-
-        logger.info("Loading Tokenizer and Model for {}".format(self.lm_key))
-        self.tokenizer = tokenizer_dictionary[self.lm_key].from_pretrained(self.lm_key)
-        self.model = model_dictionary[self.lm_key].from_pretrained(self.lm_key)
-        self.contains_special_token = special_tokens_present[self.lm_key]
-        if self.contains_special_token:
-            self.pad_token_id = self.tokenizer.pad_token_id
-        else:
-            special_tokens_dict = {"pad_token": "[PAD]"}
-            self.tokenizer.add_special_tokens(special_tokens_dict)
-            self.model.resize_token_embeddings(len(self.tokenizer))
-            self.pad_token_id = self.tokenizer.pad_token_id
-
-    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
-
-        super(PreTrainedLMFeaturizer, self).__init__(component_config)
-
-        self._load_transformers_params()
-
-    def train(
-        self,
-        training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
-        **kwargs: Any,
-    ) -> None:
-
-        bs = 128
-
-        for attribute in [MESSAGE_TEXT_ATTRIBUTE]:
-
-            start_index = 0
-
-            while start_index < len(training_data.intent_examples):
-
-                end_index = min(start_index + bs, len(training_data.intent_examples))
-                batch_examples = training_data.intent_examples[start_index:end_index]
-                batch_text = [
-                    self._clean_text(ex.get(attribute)) for ex in batch_examples
-                ]
-
-                batch_feats = self._compute_features(batch_text)
-
-                for index, ex in enumerate(batch_examples):
-
-                    ex.set(
-                        MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
-                        self._combine_with_existing_dense_features(
-                            ex,
-                            batch_feats[index],
-                            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
-                        ),
-                    )
-
-                    # print(ex.get(attribute), batch_feats[index].shape[0])
-
-                start_index += bs
-
-        # for example in training_data.intent_examples:
-        #     for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
-        #         self._set_lm_features(example, attribute)
-
-    @staticmethod
-    def _clean_text(text):
-
-        cleaned_text = re.sub(
-            # there is a space or an end of a string after it
-            r"[^\w#@&]+(?=\s|$)|"
-            # there is a space or beginning of a string before it
-            # not followed by a number
-            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
-            # not in between numbers and not . or @ or & or - or #
-            # e.g. 10'000.00 or blabla@gmail.com
-            # and not url characters
-            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
-            " ",
-            text,
-        )
-
-        if not cleaned_text.strip():
-            cleaned_text = text
-
-        return cleaned_text.strip()
-
-    def _compute_input_ids(self, batch_examples):
-
-        batch_input_ids = []
-        max_seq_len = 0
-        actual_seq_lengths = []
-        for example in batch_examples:
-
-            example_input_ids = self.tokenizer.encode(
-                example, add_special_tokens=self.contains_special_token
-            )
-            max_seq_len = max(max_seq_len, len(example_input_ids))
-            actual_seq_lengths.append(len(example_input_ids))
-            batch_input_ids.append(example_input_ids)
-
-        # add padding
-        padded_input_ids = []
-
-        # Some models don't contain pad token, we use unknown token as padding token.This doesn't affect the computation
-        # since we compute an attention mask anyways.
-
-        # pad_token_id = self.tokenizer.pad_token_id if self.contains_special_token else self.tokenizer.unk_token_id
-        for example_input_ids in batch_input_ids:
-            padded_input_ids.append(
-                example_input_ids
-                + [self.pad_token_id] * (max_seq_len - len(example_input_ids))
-            )
-
-        return torch.tensor(padded_input_ids), actual_seq_lengths
-
-    def _compute_attention_mask(self, actual_seq_lengths):
-
-        attention_mask = []
-        max_seq_length = max(actual_seq_lengths)
-        for index in range(len(actual_seq_lengths)):
-            example_seq_length = actual_seq_lengths[index]
-            attention_mask.append(
-                [1] * example_seq_length + [0] * (max_seq_length - example_seq_length)
-            )
-
-        attention_mask = np.array(attention_mask).astype(np.float32)
-
-        return torch.tensor(attention_mask)
-
-    def _compute_features(self, batch_inputs):
-
-        batch_model_inputs, actual_seq_lengths = self._compute_input_ids(batch_inputs)
-        batch_attention_mask = self._compute_attention_mask(actual_seq_lengths)
-
-        with torch.no_grad():
-            last_hidden_states = self.model(
-                batch_model_inputs, attention_mask=batch_attention_mask
-            )[
-                0
-            ].numpy()  # Models outputs are now numpy array
-            sequence_embedding = last_hidden_states  # First element of batch
-
-            truncated_embeds = self._extract_nonpadded_embeddings(
-                sequence_embedding, actual_seq_lengths
-            )
-
-            return truncated_embeds
-
-    def _extract_nonpadded_embeddings(self, embeddings, actual_seq_lengths):
-
-        truncated_embeds = []
-        for index, embedding in enumerate(embeddings):
-            unmasked_embedding = embedding[: actual_seq_lengths[index]]
-
-            if self.contains_special_token:
-                # dim - (seq + 2, hdim)
-                # Discard SEP token and move CLS token to last index
-                unmasked_embedding = unmasked_embedding[:-1, :]  # Discard SEP
-                unmasked_embedding = np.roll(
-                    unmasked_embedding, -1, axis=0
-                )  # Move CLS to back
-            else:
-                unmasked_embedding = np.concatenate(
-                    [unmasked_embedding, np.zeros((1, unmasked_embedding.shape[-1]))],
-                    axis=0,
-                )
-            truncated_embeds.append(unmasked_embedding)
-
-        return np.array(truncated_embeds)
-
-    def _set_lm_features(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
-
-        message_attribute_text = example.get(attribute)
-        if message_attribute_text:
-            # Encode text
-            input_ids = torch.tensor(
-                [
-                    self.tokenizer.encode(
-                        message_attribute_text,
-                        add_special_tokens=self.contains_special_token,
-                    )
-                ]
-            )  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
-            with torch.no_grad():
-                last_hidden_states = self.model(input_ids)[
-                    0
-                ].numpy()  # Models outputs are now numpy array
-                sequence_embedding = last_hidden_states[0]  # First element of batch
-
-                if self.contains_special_token:
-                    # dim - (seq + 2, hdim)
-                    # Discard SEP token and move CLS token to last index
-                    sequence_embedding = sequence_embedding[:-1]  # Discard SEP
-                    sequence_embedding = np.roll(
-                        sequence_embedding, -1
-                    )  # Move CLS to back
-                else:
-                    sequence_embedding = np.concatenate(
-                        [
-                            sequence_embedding,
-                            np.zeros((1, sequence_embedding.shape[-1])),
-                        ],
-                        axis=0,
-                    )
-
-                features = self._combine_with_existing_dense_features(
-                    example,
-                    sequence_embedding,
-                    MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute],
-                )
-                example.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features)
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-
-        cleaned_text = self._clean_text(message.get(MESSAGE_TEXT_ATTRIBUTE))
-
-        feats = self._compute_features([cleaned_text])
-        message.set(
-            MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
-            self._combine_with_existing_dense_features(
-                message,
-                feats[0],
-                MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
-            ),
-        )
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index bc4dff62f7da..f842b87f854a 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -1114,10 +1114,6 @@ def run_evaluation(
             entity_results, extractors, output_directory, successes, errors
         )
 
-        write_prediction_for_hermit_evaluation(
-            entity_results, intent_results, extractors, output_directory
-        )
-
     return result
 
 
@@ -1554,58 +1550,6 @@ def return_entity_results(results: EntityMetrics, dataset_name: Text) -> None:
         return_results(result, dataset_name)
 
 
-def write_prediction_for_hermit_evaluation(
-    entity_results, intent_results, extractors, output_directory
-):
-    import json
-
-    out = []
-    aligned_predictions = align_all_entity_predictions(entity_results, extractors)
-
-    for intent_result, e_pred, entity_result in zip(
-        intent_results, aligned_predictions, entity_results
-    ):
-
-        entity_gold = e_pred["target_labels"]
-        entity_pred = e_pred["extractor_labels"]["EmbeddingIntentClassifier"]
-
-        last = "O"
-        for j in range(len(entity_pred)):
-            if entity_pred[j] != "O" and last != entity_pred[j]:
-                last = entity_pred[j]
-                entity_pred[j] = "B-" + entity_pred[j]
-            elif entity_pred[j] != "O" and last == entity_pred[j]:
-                last = entity_pred[j]
-                entity_pred[j] = "I-" + entity_pred[j]
-            else:
-                last = entity_pred[j]
-
-        last = "O"
-        for j in range(len(entity_gold)):
-            if entity_gold[j] != "O" and last != entity_gold[j]:
-                last = entity_gold[j]
-                entity_gold[j] = "B-" + entity_gold[j]
-            elif entity_gold[j] != "O" and last == entity_gold[j]:
-                last = entity_gold[j]
-                entity_gold[j] = "I-" + entity_gold[j]
-            else:
-                last = entity_gold[j]
-
-        obj = {
-            "tokens": [t.text for t in entity_result.tokens],
-            "intent_gold": [intent_result.intent_target for _ in entity_result.tokens],
-            "intent_pred": [
-                intent_result.intent_prediction for _ in entity_result.tokens
-            ],
-            "frame_element_gold": entity_gold,
-            "frame_element_pred": entity_pred,
-        }
-        out.append(obj)
-
-    with open(os.path.join(output_directory, "hermit_eval.json"), "w") as outfile:
-        json.dump(out, outfile, indent=2)
-
-
 if __name__ == "__main__":
     raise RuntimeError(
         "Calling `rasa.nlu.test` directly is no longer supported. Please use "
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
deleted file mode 100644
index 8b61624c1c6a..000000000000
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ /dev/null
@@ -1,124 +0,0 @@
-from typing import Any, Dict, List, Text
-
-from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-from rasa.nlu.training_data import Message
-from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES
-import tensorflow as tf
-
-
-class ConveRTTokenizer(WhitespaceTokenizer):
-
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
-    defaults = {
-        # Flag to check whether to split intents
-        "intent_tokenization_flag": False,
-        # Symbol on which intent should be split
-        "intent_split_symbol": "_",
-        # Text will be tokenized with case sensitive as default
-        "case_sensitive": True,
-    }
-
-    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
-        """Construct a new tokenizer using the WhitespaceTokenizer framework."""
-
-        super().__init__(component_config)
-
-        self._load_tokenizer_params()
-
-    def _load_tokenizer_params(self):
-
-        # needed to load the ConveRT model
-        import tensorflow_text
-        import tensorflow_hub as tfhub
-
-        self.graph = tf.Graph()
-        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-
-        with self.graph.as_default():
-            self.session = tf.Session()
-            self.module = tfhub.Module(model_url)
-
-            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            self.tokenized = self.module(self.text_placeholder, signature="tokenize")
-
-            self.session.run(tf.tables_initializer())
-            self.session.run(tf.global_variables_initializer())
-
-    def _tokenize(self, sentence: Text) -> Any:
-        return self.session.run(
-            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
-        )
-
-    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
-        """Tokenize the text using the ConveRT model.
-
-        ConveRT adds a special char in front of (some) words and splits words into
-        sub-words. To ensure the entity start and end values matches the token values,
-        tokenize the text first using the whitespace tokenizer. If individual tokens
-        are split up into multiple tokens, we make sure that the start end end value
-        of the first and last respective tokens stay the same.
-        """
-
-        # perform whitespace tokenization
-        tokens_in = super().tokenize(message, attribute)
-
-        tokens_out = []
-
-        for token in tokens_in:
-            token_start, token_end, token_text = token.start, token.end, token.text
-
-            # use ConveRT model to tokenize the text
-            split_token_strings = self._tokenize(token_text)[0]
-
-            # clean tokens (remove special chars and empty tokens)
-            split_token_strings = self._clean_tokens(split_token_strings)
-
-            _aligned_tokens = self._align_tokens(
-                split_token_strings, token_end, token_start
-            )
-            tokens_out += _aligned_tokens
-
-        return tokens_out
-
-    def _clean_tokens(self, tokens: List[bytes]):
-        """Encode tokens and remove special char added by ConveRT."""
-
-        tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
-        return [string for string in tokens if string]
-
-    def _align_tokens(self, tokens_in: List[Text], token_end: int, token_start: int):
-        """Align sub-tokens of ConveRT with tokens return by the WhitespaceTokenizer.
-
-        As ConveRT might split a single word into multiple tokens, we need to make
-        sure that the start and end value of first and last sub-token matches the
-        start and end value of the token return by the WhitespaceTokenizer as the
-        entities are using those start and end values.
-        """
-
-        tokens_out = []
-
-        current_token_offset = token_start
-
-        for index, string in enumerate(tokens_in):
-            if index == 0:
-                if index == len(tokens_in) - 1:
-                    s_token_end = token_end
-                else:
-                    s_token_end = current_token_offset + len(string)
-                tokens_out.append(Token(string, token_start, end=s_token_end))
-            elif index == len(tokens_in) - 1:
-                tokens_out.append(Token(string, current_token_offset, end=token_end))
-            else:
-                tokens_out.append(
-                    Token(
-                        string,
-                        current_token_offset,
-                        end=current_token_offset + len(string),
-                    )
-                )
-
-            current_token_offset += len(string)
-
-        return tokens_out

From a5ccecd49516faa8e55076232b56d6992f5c10ff Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 14:41:09 +0100
Subject: [PATCH 121/633] update requirements

---
 setup.py                         | 5 ++---
 tests/nlu/training/test_train.py | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index a0924aa0eac8..09e28173c17f 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@
     "pymongo[tls,srv]~=3.8",
     "numpy~=1.16",
     "scipy~=1.2",
-    "tensorflow~=2.0",
+    "tensorflow~=2.1",
     # absl is a tensorflow dependency, but produces double logging before 0.8
     # should be removed once tensorflow requires absl > 0.8 on its own
     "absl-py>=0.8.0",
@@ -86,8 +86,7 @@
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
-    # TODO requirements for convert on tf2.0
-    # "convert": ["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"],
+    "convert": ["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 042cbc1e4ccf..19bdef9cf575 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -69,9 +69,6 @@ def test_all_components_are_in_at_least_one_test_pipeline():
 
     all_components = [c["name"] for _, p in pipelines_for_tests() for c in p]
     for cls in registry.component_classes:
-        # different tokenization is needed
-        if cls.name == "PreTrainedLMFeaturizer":
-            continue
         assert (
             cls.name in all_components
         ), "`all_components` template is missing component."

From d3e78e253ac063a599bac0f572491a84118a6471 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 14:43:30 +0100
Subject: [PATCH 122/633] clean up

---
 rasa/nlu/components.py                        |   2 +-
 rasa/nlu/tokenizers/convert_tokenizer.py      | 124 ++++++++++++++++
 .../nlu/tokenizers/pretrained_lm_tokenizer.py | 139 ------------------
 3 files changed, 125 insertions(+), 140 deletions(-)
 create mode 100644 rasa/nlu/tokenizers/convert_tokenizer.py
 delete mode 100644 rasa/nlu/tokenizers/pretrained_lm_tokenizer.py

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index a0a51613170e..c305a2e2eb8a 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -452,7 +452,7 @@ def __add_to_cache(self, component: Component, cache_key: Optional[Text]) -> Non
 
         if cache_key is not None and self.use_cache:
             self.component_cache[cache_key] = component
-            logger.debug(
+            logger.info(
                 f"Added '{component.name}' to component cache. Key '{cache_key}'."
             )
 
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
new file mode 100644
index 000000000000..8b61624c1c6a
--- /dev/null
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -0,0 +1,124 @@
+from typing import Any, Dict, List, Text
+
+from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.training_data import Message
+from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES
+import tensorflow as tf
+
+
+class ConveRTTokenizer(WhitespaceTokenizer):
+
+    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
+
+    defaults = {
+        # Flag to check whether to split intents
+        "intent_tokenization_flag": False,
+        # Symbol on which intent should be split
+        "intent_split_symbol": "_",
+        # Text will be tokenized with case sensitive as default
+        "case_sensitive": True,
+    }
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+        """Construct a new tokenizer using the WhitespaceTokenizer framework."""
+
+        super().__init__(component_config)
+
+        self._load_tokenizer_params()
+
+    def _load_tokenizer_params(self):
+
+        # needed to load the ConveRT model
+        import tensorflow_text
+        import tensorflow_hub as tfhub
+
+        self.graph = tf.Graph()
+        model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+
+        with self.graph.as_default():
+            self.session = tf.Session()
+            self.module = tfhub.Module(model_url)
+
+            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
+            self.tokenized = self.module(self.text_placeholder, signature="tokenize")
+
+            self.session.run(tf.tables_initializer())
+            self.session.run(tf.global_variables_initializer())
+
+    def _tokenize(self, sentence: Text) -> Any:
+        return self.session.run(
+            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
+        )
+
+    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
+        """Tokenize the text using the ConveRT model.
+
+        ConveRT adds a special char in front of (some) words and splits words into
+        sub-words. To ensure the entity start and end values matches the token values,
+        tokenize the text first using the whitespace tokenizer. If individual tokens
+        are split up into multiple tokens, we make sure that the start end end value
+        of the first and last respective tokens stay the same.
+        """
+
+        # perform whitespace tokenization
+        tokens_in = super().tokenize(message, attribute)
+
+        tokens_out = []
+
+        for token in tokens_in:
+            token_start, token_end, token_text = token.start, token.end, token.text
+
+            # use ConveRT model to tokenize the text
+            split_token_strings = self._tokenize(token_text)[0]
+
+            # clean tokens (remove special chars and empty tokens)
+            split_token_strings = self._clean_tokens(split_token_strings)
+
+            _aligned_tokens = self._align_tokens(
+                split_token_strings, token_end, token_start
+            )
+            tokens_out += _aligned_tokens
+
+        return tokens_out
+
+    def _clean_tokens(self, tokens: List[bytes]):
+        """Encode tokens and remove special char added by ConveRT."""
+
+        tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
+        return [string for string in tokens if string]
+
+    def _align_tokens(self, tokens_in: List[Text], token_end: int, token_start: int):
+        """Align sub-tokens of ConveRT with tokens return by the WhitespaceTokenizer.
+
+        As ConveRT might split a single word into multiple tokens, we need to make
+        sure that the start and end value of first and last sub-token matches the
+        start and end value of the token return by the WhitespaceTokenizer as the
+        entities are using those start and end values.
+        """
+
+        tokens_out = []
+
+        current_token_offset = token_start
+
+        for index, string in enumerate(tokens_in):
+            if index == 0:
+                if index == len(tokens_in) - 1:
+                    s_token_end = token_end
+                else:
+                    s_token_end = current_token_offset + len(string)
+                tokens_out.append(Token(string, token_start, end=s_token_end))
+            elif index == len(tokens_in) - 1:
+                tokens_out.append(Token(string, current_token_offset, end=token_end))
+            else:
+                tokens_out.append(
+                    Token(
+                        string,
+                        current_token_offset,
+                        end=current_token_offset + len(string),
+                    )
+                )
+
+            current_token_offset += len(string)
+
+        return tokens_out
diff --git a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py b/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
deleted file mode 100644
index d9439e98b266..000000000000
--- a/rasa/nlu/tokenizers/pretrained_lm_tokenizer.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from typing import Any, Dict, Optional, Text
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
-from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    MESSAGE_TEXT_ATTRIBUTE,
-    MESSAGE_TOKENS_NAMES,
-    SPACY_FEATURIZABLE_ATTRIBUTES,
-)
-from transformers import *
-
-logger = logging.getLogger(__name__)
-
-tokenizer_dictionary = {
-    "bert-base-uncased": BertTokenizer,
-    "openai-gpt": OpenAIGPTTokenizer,
-    # "gpt2": GPT2Tokenizer,
-    # "transfo-xl-wt103": TransfoXLTokenizer,
-    # "xlnet-base-cased": XLNetTokenizer,
-    # "xlm-mlm-enfr-1024": XLMTokenizer,
-    # "distilbert-base-uncased": DistilBertTokenizer,
-    # "roberta-base": RobertaTokenizer,
-}
-
-special_tokens_present = {
-    "bert-base-uncased": True,
-    "openai-gpt": False,
-    # "gpt2": False,
-    # "transfo-xl-wt103": False,
-    # "xlnet-base-cased": True,
-    # "xlm-mlm-enfr-1024": True,
-    # "distilbert-base-uncased": True,
-    # "roberta-base": True,
-}
-
-
-class PreTrainedLMTokenizer(Tokenizer):
-
-    provides = [
-        MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    defaults = {
-        # model key identified by HF Transformers
-        "use_cls_token": True,
-        "lm_key": "bert-base-uncased",
-    }
-
-    def _load_tokenizer_params(self):
-
-        self.lm_key = self.component_config["lm_key"]
-
-        if self.lm_key not in tokenizer_dictionary:
-            logger.error("{} not a valid model key name".format(self.lm_key))
-            raise
-
-        logger.info("Loading Tokenizer for {}".format(self.lm_key))
-        self.tokenizer = tokenizer_dictionary[self.lm_key].from_pretrained(self.lm_key)
-        self.contains_special_token = special_tokens_present[self.lm_key]
-
-    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
-
-        super(PreTrainedLMTokenizer, self).__init__(component_config)
-
-        self._load_tokenizer_params()
-
-    def train(
-        self,
-        training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
-        **kwargs: Any,
-    ) -> None:
-
-        for example in training_data.intent_examples:
-            for attribute in SPACY_FEATURIZABLE_ATTRIBUTES:
-                example.set(
-                    MESSAGE_TOKENS_NAMES[attribute],
-                    self._get_lm_tokens(example, attribute),
-                )
-
-    def _get_lm_tokens(self, example, attribute=MESSAGE_TEXT_ATTRIBUTE):
-
-        message_attribute_text = example.get(attribute)
-        if message_attribute_text:
-
-            expanded_tokens_list = []
-
-            # We assume that whitespace tokenizer was used before this and hence tokens attribute is set.
-            space_tokens_list = example.get(MESSAGE_TOKENS_NAMES[attribute])
-
-            for token in space_tokens_list:
-
-                token_start, token_end, token_text = token.offset, token.end, token.text
-
-                # Encode text
-
-                # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
-                split_token_ids = self.tokenizer.encode(token_text)
-
-                split_token_strings = self.tokenizer.convert_ids_to_tokens(
-                    split_token_ids
-                )
-
-                # print(split_token_strings)
-
-                current_token_offset = token_start
-                for index, string in enumerate(split_token_strings):
-                    if index == 0:
-                        if index == len(split_token_strings) - 1:
-                            s_token_end = token_end
-                        else:
-                            s_token_end = current_token_offset + len(string)
-                        expanded_tokens_list.append(
-                            Token(string, token_start, end=s_token_end)
-                        )
-                    elif index == len(split_token_strings) - 1:
-                        expanded_tokens_list.append(
-                            Token(string, current_token_offset, end=token_end)
-                        )
-                    else:
-                        expanded_tokens_list.append(
-                            Token(
-                                string,
-                                current_token_offset,
-                                end=current_token_offset + len(string),
-                            )
-                        )
-                    current_token_offset += len(string)
-
-            expanded_tokens_list = self.add_cls_token(expanded_tokens_list, attribute)
-
-            # print(message_attribute_text, len(space_tokens_list), len(expanded_tokens_list))
-
-            return expanded_tokens_list
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-
-        tokens = self._get_lm_tokens(message)
-        message.set(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], tokens)

From bd9f72d931a0d4ad6fb73701b477bdf6fcb27389 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 14:45:26 +0100
Subject: [PATCH 123/633] clean up imports

---
 .../embedding_intent_classifier.py            |  1 -
 rasa/utils/tf_layers.py                       | 14 +----------
 rasa/utils/train_utils.py                     | 25 +------------------
 3 files changed, 2 insertions(+), 38 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 13e14b58dbff..fcbcb661b7e0 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -4,7 +4,6 @@
 import os
 import pickle
 import scipy.sparse
-import typing
 import warnings
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, Callable
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tf_layers.py
index d62a6de579f9..5fd8f82dd68b 100644
--- a/rasa/utils/tf_layers.py
+++ b/rasa/utils/tf_layers.py
@@ -1,17 +1,5 @@
 import logging
-import typing
-from typing import (
-    List,
-    Optional,
-    Text,
-    Dict,
-    Tuple,
-    Union,
-    Generator,
-    Callable,
-    Any,
-    NamedTuple,
-)
+from typing import List, Optional, Text, Tuple, Callable
 import tensorflow as tf
 import tensorflow_addons as tfa
 import numpy as np
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index da223c9789c3..aa2a0a3a4856 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -2,34 +2,11 @@
 import logging
 import scipy.sparse
 import typing
-from typing import (
-    List,
-    Optional,
-    Text,
-    Dict,
-    Tuple,
-    Union,
-    Generator,
-    Callable,
-    Any,
-    NamedTuple,
-)
+from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Any, NamedTuple
 import numpy as np
-from tqdm import tqdm
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
 
-# from tensor2tensor.models.transformer import (
-#     transformer_base,
-#     transformer_prepare_encoder,
-#     transformer_encoder,
-# )
-# from tensor2tensor.layers.common_attention import large_compatible_negative
-from rasa.utils.common import is_logging_disabled
-
-
-if typing.TYPE_CHECKING:
-    from tensor2tensor.utils.hparam import HParams
 
 logger = logging.getLogger(__name__)
 

From 7b0493d168f5a62981213d44d7e1090b7ecf7138 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 15:04:05 +0100
Subject: [PATCH 124/633] more clean up

---
 rasa/utils/train_utils.py        | 32 --------------------------------
 tests/nlu/training/test_train.py | 21 ---------------------
 2 files changed, 53 deletions(-)

diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index aa2a0a3a4856..dc0673480dda 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -573,38 +573,6 @@ def output_validation_stat(
     return ep_val_metrics
 
 
-def _write_training_metrics(
-    output_file: Text,
-    epoch: int,
-    train_metrics: TrainingMetrics,
-    val_metrics: TrainingMetrics,
-):
-    if output_file:
-        import datetime
-
-        # output log file
-        with open(output_file, "a") as f:
-            # make headers on first epoch
-            if epoch == 0:
-                f.write(f"EPOCH\tTIMESTAMP")
-                [f.write(f"\t{key.upper()}") for key in train_metrics.loss.keys()]
-                [f.write(f"\t{key.upper()}") for key in train_metrics.score.keys()]
-                [f.write(f"\tVAL_{key.upper()}") for key in train_metrics.loss.keys()]
-                [f.write(f"\tVAL_{key.upper()}") for key in train_metrics.score.keys()]
-
-            f.write(f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}")
-            [f.write(f"\t{val:.3f}") for val in train_metrics.loss.values()]
-            [f.write(f"\t{val:.3f}") for val in train_metrics.score.values()]
-            [
-                f.write(f"\t{val:.3f}") if val else f.write("\t0.0")
-                for val in val_metrics.loss.values()
-            ]
-            [
-                f.write(f"\t{val:.3f}") if val else f.write("\t0.0")
-                for val in val_metrics.score.values()
-            ]
-
-
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 19bdef9cf575..6085b4451099 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -153,27 +153,6 @@ async def test_train_model_no_events(language, pipeline, component_builder, tmpd
     assert loaded.parse("Hello today is Monday, again!") is not None
 
 
-@utilities.slowtest
-async def test_train_model_with_entities(component_builder, tmpdir):
-    _config = utilities.base_test_conf("supervised_embeddings")
-    _config.pipeline.remove({"name": "CRFEntityExtractor"})
-    (trained, _, persisted_path) = await train(
-        _config,
-        path=tmpdir.strpath,
-        data="./data/test/multiple_files_markdown",
-        component_builder=component_builder,
-    )
-    assert trained.pipeline
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    result = loaded.parse("list Italian restaurants")
-    assert result is not None
-    assert result["intent"]["name"] == "restaurant_search"
-    assert len(result["entities"]) == 1
-    assert result["entities"][0]["value"] == "Italian"
-    assert result["entities"][0]["entity"] == "cuisine"
-
-
 async def test_train_model_empty_pipeline(component_builder):
     # Should return an empty pipeline
     _config = utilities.base_test_conf(pipeline_template=None)

From 4858f0740e3962df8128abe92e6db40fb8110850 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 15:16:35 +0100
Subject: [PATCH 125/633] Rename TextFeaturizer to LexicalSyntacticFeaturizer

---
 docs/nlu/components.rst                              | 10 +++++-----
 ...featurizer.py => lexical_syntactic_featurizer.py} |  8 ++++----
 rasa/nlu/registry.py                                 |  6 ++++--
 tests/nlu/featurizers/test_text_featurizer.py        | 12 +++++++-----
 4 files changed, 20 insertions(+), 16 deletions(-)
 rename rasa/nlu/featurizers/sparse_featurizer/{text_featurizer.py => lexical_syntactic_featurizer.py} (97%)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index b855e2c08c7e..1d815f04971f 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -305,10 +305,10 @@ CountVectorsFeaturizer
           OOV_words: []  # list of strings
 
 
-TextFeaturizer
-~~~~~~~~~~~~~~~
+LexicalSyntacticFeaturizer
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: Text feature creation to support entity extraction.
+:Short: Lexical and syntactic feature creation to support entity extraction.
 :Outputs:
    ``text_sparse_features``
 :Requires: ``tokens``
@@ -320,7 +320,7 @@ TextFeaturizer
     Features could for example be if a token is upper case, if it is a digit, or the prefix of that token (e.g.
     first two characters).
 :Configuration:
-    You need to configure what kind of text features the featurizer should extract.
+    You need to configure what kind of lexical and syntactic features the featurizer should extract.
     The following features are available:
 
     ==============  =============================================================================================
@@ -349,7 +349,7 @@ TextFeaturizer
     .. code-block:: yaml
 
         pipeline:
-        - name: "TextFeaturizer":
+        - name: "LexicalSyntacticFeaturizer":
           "features": [
             ["low", "title", "upper"],
             [
diff --git a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
similarity index 97%
rename from rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
rename to rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 567d40cc1707..21164a9f1dd4 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/text_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -34,7 +34,7 @@ class Word(typing.NamedTuple):
     pos_tag: Text
 
 
-class TextFeaturizer(Featurizer):
+class LexicalSyntacticFeaturizer(Featurizer):
 
     provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
 
@@ -289,9 +289,9 @@ def load(
         meta: Dict[Text, Any],
         model_dir: Optional[Text] = None,
         model_metadata: Optional["Metadata"] = None,
-        cached_component: Optional["TextFeaturizer"] = None,
+        cached_component: Optional["LexicalSyntacticFeaturizer"] = None,
         **kwargs: Any,
-    ) -> "TextFeaturizer":
+    ) -> "LexicalSyntacticFeaturizer":
 
         file_name = meta.get("file")
 
@@ -300,7 +300,7 @@ def load(
         ) as f:
             feature_to_idx_dict = pickle.load(f)
 
-        return TextFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict)
+        return LexicalSyntacticFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict)
 
     def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
         """Persist this model into the passed directory.
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index f78f9b79915e..bacc27dd6070 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -19,7 +19,9 @@
 from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
 from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
 from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
-from rasa.nlu.featurizers.sparse_featurizer.text_featurizer import TextFeaturizer
+from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
+    LexicalSyntacticFeaturizer,
+)
 from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
     CountVectorsFeaturizer,
 )
@@ -67,7 +69,7 @@
     SpacyFeaturizer,
     MitieFeaturizer,
     RegexFeaturizer,
-    TextFeaturizer,
+    LexicalSyntacticFeaturizer,
     CountVectorsFeaturizer,
     ConveRTFeaturizer,
     # classifiers
diff --git a/tests/nlu/featurizers/test_text_featurizer.py b/tests/nlu/featurizers/test_text_featurizer.py
index 971644dedf84..9d1c71a9fa8a 100644
--- a/tests/nlu/featurizers/test_text_featurizer.py
+++ b/tests/nlu/featurizers/test_text_featurizer.py
@@ -4,7 +4,9 @@
 import scipy.sparse
 
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-from rasa.nlu.featurizers.sparse_featurizer.text_featurizer import TextFeaturizer
+from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
+    LexicalSyntacticFeaturizer,
+)
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.constants import TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES, SPACY_DOCS
 from rasa.nlu.training_data import Message
@@ -26,7 +28,7 @@
     ],
 )
 def test_text_featurizer(sentence, expected, expected_cls):
-    featurizer = TextFeaturizer(
+    featurizer = LexicalSyntacticFeaturizer(
         {"features": [["upper"], ["prefix2", "suffix2", "digit"], ["low"]]}
     )
 
@@ -61,7 +63,7 @@ def test_text_featurizer(sentence, expected, expected_cls):
     ],
 )
 def test_text_featurizer_window_size(sentence, expected, expected_cls):
-    featurizer = TextFeaturizer(
+    featurizer = LexicalSyntacticFeaturizer(
         {"features": [["upper"], ["digit"], ["low"], ["digit"]]}
     )
 
@@ -86,7 +88,7 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):
 
 
 def test_text_featurizer_missing_spacy_nlp():
-    featurizer = TextFeaturizer({"features": [["pos", "pos2"]]})
+    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})
 
     train_message = Message("Missing spacy.")
 
@@ -114,7 +116,7 @@ def test_text_featurizer_missing_spacy_nlp():
     ],
 )
 def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
-    featurizer = TextFeaturizer({"features": [["pos", "pos2"]]})
+    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})
 
     train_message = Message(sentence)
     test_message = Message(sentence)

From c56ad4199c6f1764797cacdd7588d49c18ff0ddb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 15:17:36 +0100
Subject: [PATCH 126/633] update changelog

---
 changelog/5065.feature.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/changelog/5065.feature.rst b/changelog/5065.feature.rst
index f65d068ab872..1f2ea11be93b 100644
--- a/changelog/5065.feature.rst
+++ b/changelog/5065.feature.rst
@@ -1,4 +1,5 @@
-Add ``TextFeaturizer`` to sparse featurizers.
+Add ``LexicalSyntacticFeaturizer`` to sparse featurizers.
 
-``TextFeaturizer`` does the same featurization as the ``CRFEntityExtractor``. We extracted the featurization into
-a separate component so that the features can be reused and featurization is independent from the entity extraction.
\ No newline at end of file
+``LexicalSyntacticFeaturizer`` does the same featurization as the ``CRFEntityExtractor``. We extracted the
+featurization into a separate component so that the features can be reused and featurization is independent from the
+entity extraction.
\ No newline at end of file

From 4284b607ff68102595e01bcb0668bb03b6d278a8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 17 Jan 2020 17:57:33 +0100
Subject: [PATCH 127/633] Add RasaModelData

---
 rasa/utils/tf_model_data.py | 396 ++++++++++++++++++++++++++++++++++++
 1 file changed, 396 insertions(+)
 create mode 100644 rasa/utils/tf_model_data.py

diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
new file mode 100644
index 000000000000..7e9b461acc95
--- /dev/null
+++ b/rasa/utils/tf_model_data.py
@@ -0,0 +1,396 @@
+import numpy as np
+import scipy.sparse
+import tensorflow as tf
+
+from sklearn.model_selection import train_test_split
+from typing import Optional, Dict, Text, List, Tuple, Any, Union, Generator
+from collections import defaultdict
+
+from utils import train_utils
+
+
+class RasaModelData:
+    def __init__(self, data: Optional[Dict[Text, List[np.ndarray]]] = None):
+        if data is None:
+            self.data = {}
+        else:
+            self.data = data
+
+    def items(self):
+        return self.data.items()
+
+    def values(self):
+        return self.data.values()
+
+    def keys(self):
+        return self.data.keys()
+
+    def split(
+        self, number_of_test_examples: int, random_seed: int, label_key: Text
+    ) -> Tuple["RasaModelData", "RasaModelData"]:
+        """Create random hold out test set using stratified split."""
+
+        self._check_label_key(label_key)
+
+        label_ids = self._create_label_ids(self.data[label_key][0])
+        label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
+
+        self._check_train_test_sizes(number_of_test_examples, label_counts)
+
+        counts = np.array([label_counts[label] for label in label_ids])
+        multi_values = [v[counts > 1] for values in self.data.values() for v in values]
+        solo_values = [v[counts == 1] for values in self.data.values() for v in values]
+
+        output_values = train_test_split(
+            *multi_values,
+            test_size=number_of_test_examples,
+            random_state=random_seed,
+            stratify=label_ids[counts > 1],
+        )
+
+        return self._convert_train_test_split(output_values, solo_values)
+
+    def add_features(self, key: Text, features: List[np.ndarray]):
+        """Add list of features to data under specified key."""
+
+        if not features:
+            return
+
+        if key in self.data:
+            raise ValueError(f"Key '{key}' already exists in RasaModelData.")
+
+        self.data[key] = []
+
+        for data in features:
+            if data.size > 0:
+                self.data[key].append(data)
+
+        if not self.data[key]:
+            del self.data[key]
+
+    def add_mask(self, key: Text, from_key: Text):
+        """Calculate mask for given key and put it under specified key."""
+
+        if not self.data.get(from_key):
+            return
+
+        self.data[key] = []
+
+        for data in self.data[from_key]:
+            if data.size > 0:
+                # explicitly add last dimension to mask
+                # to track correctly dynamic sequences
+                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
+                self.data[key].append(mask)
+                break
+
+    def get_signature(self) -> Dict[Text, Tuple[bool, Tuple[int]]]:
+        """Get signature of RasaModelData.
+
+        Signature stores the shape and whether features are sparse or not for every
+        key."""
+
+        return {
+            key: [
+                (True if isinstance(v[0], scipy.sparse.spmatrix) else False, v[0].shape)
+                for v in values
+            ]
+            for key, values in self.data.items()
+        }
+
+    def shuffle(self) -> None:
+        """Shuffle session data."""
+
+        data_points = self.get_number_of_examples()
+        ids = np.random.permutation(data_points)
+        self.data = self._data_for_ids(ids)
+
+    # noinspection PyPep8Naming
+    def balance(self, batch_size: int, shuffle: bool, label_key: Text) -> None:
+        """Mix session data to account for class imbalance.
+
+        This batching strategy puts rare classes approximately in every other batch,
+        by repeating them. Mimics stratified batching, but also takes into account
+        that more populated classes should appear more often.
+        """
+
+        if label_key not in self.data or len(self.data[label_key]) > 1:
+            raise ValueError(f"Key '{label_key}' not in RasaModelData.")
+
+        label_ids = self._create_label_ids(self.data[label_key][0])
+
+        unique_label_ids, counts_label_ids = np.unique(
+            label_ids, return_counts=True, axis=0
+        )
+        num_label_ids = len(unique_label_ids)
+
+        # need to call every time, so that the data is shuffled inside each class
+        label_data = self._split_by_label_ids(label_ids, unique_label_ids)
+
+        data_idx = [0] * num_label_ids
+        num_data_cycles = [0] * num_label_ids
+        skipped = [False] * num_label_ids
+
+        new_data = defaultdict(list)
+        num_examples = self.get_number_of_examples()
+
+        while min(num_data_cycles) == 0:
+            if shuffle:
+                indices_of_labels = np.random.permutation(num_label_ids)
+            else:
+                indices_of_labels = range(num_label_ids)
+
+            for index in indices_of_labels:
+                if num_data_cycles[index] > 0 and not skipped[index]:
+                    skipped[index] = True
+                    continue
+                else:
+                    skipped[index] = False
+
+                index_batch_size = (
+                    int(counts_label_ids[index] / num_examples * batch_size) + 1
+                )
+
+                for k, values in label_data[index].items():
+                    for i, v in enumerate(values):
+                        if len(new_data[k]) < i + 1:
+                            new_data[k].append([])
+                        new_data[k][i].append(
+                            v[data_idx[index] : data_idx[index] + index_batch_size]
+                        )
+
+                data_idx[index] += index_batch_size
+                if data_idx[index] >= counts_label_ids[index]:
+                    num_data_cycles[index] += 1
+                    data_idx[index] = 0
+
+                if min(num_data_cycles) > 0:
+                    break
+
+        final_data = defaultdict(list)
+        for k, values in new_data.items():
+            for v in values:
+                final_data[k].append(np.concatenate(np.array(v)))
+
+        self.data = final_data
+
+    def get_number_of_examples(self) -> int:
+        """Obtain number of examples in session data.
+
+        Raise a ValueError if number of examples differ for different data in
+        session data.
+        """
+
+        example_lengths = [v.shape[0] for values in self.data.values() for v in values]
+
+        # check if number of examples is the same for all values
+        if not all(length == example_lengths[0] for length in example_lengths):
+            raise ValueError(
+                f"Number of examples differs for keys '{self.data.keys()}'. Number of "
+                f"examples should be the same for all data."
+            )
+
+        return example_lengths[0]
+
+    def get_feature_dimension(self, key: Text) -> int:
+        """Get the feature dimension of the given key."""
+
+        number_of_features = 0
+        for data in self.data[key]:
+            if data.size > 0:
+                number_of_features += data[0].shape[-1]
+
+        return number_of_features
+
+    def convert_to_tf_dataset(
+        self,
+        batch_size: int,
+        label_key: Text,
+        batch_strategy: Text = "sequence",
+        shuffle: bool = False,
+    ):
+        """Create tf dataset."""
+
+        shapes, types = self._get_shapes_types()
+
+        return tf.data.Dataset.from_generator(
+            lambda batch_size_: self._gen_batch(
+                batch_size_, label_key, batch_strategy, shuffle
+            ),
+            output_types=types,
+            output_shapes=shapes,
+            args=([batch_size]),
+        )
+
+    def _get_shapes_types(self) -> Tuple:
+        """Extract shapes and types from session data."""
+
+        types = []
+        shapes = []
+
+        def append_shape(v: np.ndarray):
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                shapes.append((None, v[0].ndim + 1))
+                shapes.append((None,))
+                shapes.append((v[0].ndim + 1))
+            elif v[0].ndim == 0:
+                shapes.append((None,))
+            elif v[0].ndim == 1:
+                shapes.append((None, v[0].shape[-1]))
+            else:
+                shapes.append((None, None, v[0].shape[-1]))
+
+        def append_type(v: np.ndarray):
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                types.append(tf.int64)
+                types.append(tf.float32)
+                types.append(tf.int64)
+            else:
+                types.append(tf.float32)
+
+        for values in self.data.values():
+            for v in values:
+                append_shape(v)
+                append_type(v)
+
+        return tuple(shapes), tuple(types)
+
+    def _gen_batch(
+        self,
+        batch_size: int,
+        label_key: Text,
+        batch_strategy: Text = "sequence",
+        shuffle: bool = False,
+    ) -> Generator[Tuple, None, None]:
+        """Generate batches."""
+
+        if shuffle:
+            self.shuffle()
+
+        if batch_strategy == "balanced":
+            self.balance(batch_size, shuffle, label_key)
+
+        num_examples = self.get_number_of_examples()
+        num_batches = num_examples // batch_size + int(num_examples % batch_size > 0)
+
+        for batch_num in range(num_batches):
+            start = batch_num * batch_size
+            end = start + batch_size
+
+            yield train_utils.prepare_batch(self.data, start, end)
+
+    def _check_train_test_sizes(
+        self, number_of_test_examples: int, label_counts: Dict[Any, int]
+    ):
+        """Check whether the test data set is too large or too small."""
+
+        number_of_total_examples = self.get_number_of_examples()
+
+        if number_of_test_examples >= number_of_total_examples - len(label_counts):
+            raise ValueError(
+                f"Test set of {number_of_test_examples} is too large. Remaining "
+                f"train set should be at least equal to number of classes "
+                f"{len(label_counts)}."
+            )
+        elif number_of_test_examples < len(label_counts):
+            raise ValueError(
+                f"Test set of {number_of_test_examples} is too small. It should "
+                f"be at least equal to number of classes {label_counts}."
+            )
+
+    def _data_for_ids(self, ids: np.ndarray):
+        """Filter session data by ids."""
+
+        new_data = defaultdict(list)
+        for k, values in self.data.items():
+            for v in values:
+                new_data[k].append(v[ids])
+        return new_data
+
+    def _split_by_label_ids(
+        self, label_ids: "np.ndarray", unique_label_ids: "np.ndarray"
+    ) -> List["RasaModelData"]:
+        """Reorganize session data into a list of session data with the same labels."""
+
+        label_data = []
+        for label_id in unique_label_ids:
+            ids = label_ids == label_id
+            label_data.append(RasaModelData(self._data_for_ids(ids)))
+        return label_data
+
+    def _check_label_key(self, label_key: Text):
+        if label_key not in self.data or len(self.data[label_key]) > 1:
+            raise ValueError(f"Key '{label_key}' not in RasaModelData.")
+
+    def _convert_train_test_split(
+        self, output_values: List[Any], solo_values: List[Any]
+    ) -> Tuple["RasaModelData", "RasaModelData"]:
+        """Convert the output of sklearn.model_selection.train_test_split into train and
+        eval session data."""
+
+        data_train = defaultdict(list)
+        data_val = defaultdict(list)
+
+        # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc.
+        # order is kept, e.g. same order as session data keys
+
+        # train datasets have an even index
+        index = 0
+        for key, values in self.data.items():
+            for _ in range(len(values)):
+                data_train[key].append(
+                    self._combine_features(output_values[index * 2], solo_values[index])
+                )
+                index += 1
+
+        # val datasets have an odd index
+        index = 0
+        for key, values in self.data.items():
+            for _ in range(len(values)):
+                data_val[key].append(output_values[(index * 2) + 1])
+                index += 1
+
+        return RasaModelData(data_train), RasaModelData(data_val)
+
+    @staticmethod
+    def _combine_features(
+        feature_1: Union[np.ndarray, scipy.sparse.spmatrix],
+        feature_2: Union[np.ndarray, scipy.sparse.spmatrix],
+    ) -> Union[np.ndarray, scipy.sparse.spmatrix]:
+        """Concatenate features."""
+
+        if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance(
+            feature_2, scipy.sparse.spmatrix
+        ):
+            if feature_2.shape[0] == 0:
+                return feature_1
+            if feature_1.shape[0] == 0:
+                return feature_2
+            return scipy.sparse.vstack([feature_1, feature_2])
+
+        return np.concatenate([feature_1, feature_2])
+
+    @staticmethod
+    def _create_label_ids(label_ids: np.ndarray) -> np.ndarray:
+        """Convert various size label_ids into single dim array.
+
+        For multi-label y, map each distinct row to a string representation
+        using join because str(row) uses an ellipsis if len(row) > 1000.
+        Idea taken from sklearn's stratify split.
+        """
+
+        if label_ids.ndim == 1:
+            return label_ids
+
+        if label_ids.ndim == 2 and label_ids.shape[-1] == 1:
+            return label_ids[:, 0]
+
+        if label_ids.ndim == 2:
+            return np.array([" ".join(row.astype("str")) for row in label_ids])
+
+        if label_ids.ndim == 3 and label_ids.shape[-1] == 1:
+            return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]])
+
+        raise ValueError("Unsupported label_ids dimensions")

From 20f31a30c4ef14acb714865f5975fa7689f07dc9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 10:15:50 +0100
Subject: [PATCH 128/633] Use token directly in LexicalSyntaticFeaturizer

---
 docs/nlu/components.rst                       |  12 +-
 .../lexical_syntactic_featurizer.py           | 164 ++++++------------
 rasa/nlu/tokenizers/spacy_tokenizer.py        |  19 +-
 ...y => test_lexical_syntactic_featurizer.py} |  20 +--
 tests/nlu/tokenizers/test_spacy_tokenizer.py  |  18 ++
 5 files changed, 100 insertions(+), 133 deletions(-)
 rename tests/nlu/featurizers/{test_text_featurizer.py => test_lexical_syntactic_featurizer.py} (88%)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 1d815f04971f..ecd11ba71940 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -315,10 +315,8 @@ LexicalSyntacticFeaturizer
 :Type: Sparse featurizer
 :Description:
     Creates features for entity extraction.
-    Moves with a sliding window over every token in the user message and creates features according to the configured
-    features (see below).
-    Features could for example be if a token is upper case, if it is a digit, or the prefix of that token (e.g.
-    first two characters).
+    Moves with a sliding window over every token in the user message and creates features according to the
+    configuration (see below).
 :Configuration:
     You need to configure what kind of lexical and syntactic features the featurizer should extract.
     The following features are available:
@@ -336,8 +334,8 @@ LexicalSyntacticFeaturizer
     suffix3         Take the last three characters of the word.
     suffix2         Take the last two characters of the word.
     suffix1         Take the last character of the word.
-    pos             Take the Part-of-Speech tag of the word (spaCy required).
-    pos2            Take the first two characters of the Part-of-Speech tag of the word (spaCy required).
+    pos             Take the Part-of-Speech tag of the word.
+    pos2            Take the first two characters of the Part-of-Speech tag of the word.
     ==============  =============================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
@@ -368,7 +366,7 @@ LexicalSyntacticFeaturizer
 
     This configuration is also the default configuration.
 
-    .. note:: If you want to make use of ``pos`` or ``pos2`` you need to add ``SpacyNLP`` to your pipeline.
+    .. note:: If you want to make use of ``pos`` or ``pos2`` you need to add ``SpacyTokenizer`` to your pipeline.
 
 
 Intent Classifiers
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 21164a9f1dd4..16141b27ae68 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -8,31 +8,17 @@
 import scipy.sparse
 from typing import Any, Dict, Optional, Text, List
 
+from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    TOKENS_NAMES,
-    TEXT_ATTRIBUTE,
-    SPARSE_FEATURE_NAMES,
-    SPACY_DOCS,
-)
+from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
 
 logger = logging.getLogger(__name__)
 
 if typing.TYPE_CHECKING:
     from rasa.nlu.model import Metadata
 
-try:
-    import spacy
-except ImportError:
-    spacy = None
-
-
-class Word(typing.NamedTuple):
-    text: Text
-    pos_tag: Text
-
 
 class LexicalSyntacticFeaturizer(Featurizer):
 
@@ -45,7 +31,7 @@ class LexicalSyntacticFeaturizer(Featurizer):
         # after holding keys about which features to use for each word,
         # for example, 'title' in array before will have the feature
         # "is the preceding word in title case?"
-        # POS features require spaCy to be installed
+        # POS features require 'SpacyTokenizer'.
         "features": [
             ["low", "title", "upper"],
             [
@@ -64,18 +50,20 @@ class LexicalSyntacticFeaturizer(Featurizer):
     }
 
     function_dict = {
-        "low": lambda word: word.text.islower(),
-        "title": lambda word: word.text.istitle(),
-        "prefix5": lambda word: word.text[:5],
-        "prefix2": lambda word: word.text[:2],
-        "suffix5": lambda word: word.text[-5:],
-        "suffix3": lambda word: word.text[-3:],
-        "suffix2": lambda word: word.text[-2:],
-        "suffix1": lambda word: word.text[-1:],
-        "pos": lambda word: word.pos_tag,
-        "pos2": lambda word: word.pos_tag[:2],
-        "upper": lambda word: word.text.isupper(),
-        "digit": lambda word: word.text.isdigit(),
+        "low": lambda token: token.text.islower(),
+        "title": lambda token: token.text.istitle(),
+        "prefix5": lambda token: token.text[:5],
+        "prefix2": lambda token: token.text[:2],
+        "suffix5": lambda token: token.text[-5:],
+        "suffix3": lambda token: token.text[-3:],
+        "suffix2": lambda token: token.text[-2:],
+        "suffix1": lambda token: token.text[-1:],
+        "pos": lambda token: token.data.get("pos") if "pos" in token.data else None,
+        "pos2": lambda token: token.data.get("pos")[:2]
+        if "pos" in token.data
+        else None,
+        "upper": lambda token: token.text.isupper(),
+        "digit": lambda token: token.text.isdigit(),
     }
 
     def __init__(
@@ -85,26 +73,7 @@ def __init__(
     ):
         super().__init__(component_config)
 
-        if feature_to_idx_dict is None:
-            self.feature_to_idx_dict = {}
-        else:
-            self.feature_to_idx_dict = feature_to_idx_dict
-
-        self._check_pos_features_and_spacy()
-
-    def _check_pos_features_and_spacy(self):
-        import itertools
-
-        features = set(
-            itertools.chain.from_iterable(self.component_config.get("features", []))
-        )
-        self.pos_features = "pos" in features or "pos2" in features
-
-        if self.pos_features and spacy is None:
-            raise ImportError(
-                "Failed to import `spaCy`. `spaCy` is required for POS features. "
-                "See https://spacy.io/usage/ for installation instructions."
-            )
+        self.feature_to_idx_dict = feature_to_idx_dict or {}
 
     def train(
         self,
@@ -124,24 +93,26 @@ def _create_text_features(self, message: Message) -> None:
         """Convert incoming messages into sparse features using the configured
         features."""
 
-        words = self._convert_to_words(message)
-        word_features = self._words_to_features(words)
-        features = self._features_to_one_hot(word_features)
+        # [:-1] to remove CLS token
+        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+
+        features = self._tokens_to_features(tokens)
+        features = self._features_to_one_hot(features)
         features = self._combine_with_existing_sparse_features(
             message, features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
         )
         message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], features)
 
     def _features_to_one_hot(
-        self, word_features: List[Dict[Text, Any]]
+        self, features: List[Dict[Text, Any]]
     ) -> scipy.sparse.spmatrix:
         """Convert the word features into a one-hot presentation using the indices
         in the feature-to-idx dictionary."""
 
-        vec = self._initialize_feature_vector(len(word_features))
+        vec = self._initialize_feature_vector(len(features))
 
-        for word_idx, word_features in enumerate(word_features):
-            for feature_key, feature_value in word_features.items():
+        for word_idx, features in enumerate(features):
+            for feature_key, feature_value in features.items():
                 if (
                     feature_key in self.feature_to_idx_dict
                     and str(feature_value) in self.feature_to_idx_dict[feature_key]
@@ -176,13 +147,15 @@ def _create_feature_to_idx_dict(
 
         Each feature key, defined in the component configuration, points to
         different feature values and their indices in the overall resulting
-        feature vector."""
+        feature vector.
+        """
 
         # get all possible feature values
         features = []
         for example in training_data.training_examples:
-            words = self._convert_to_words(example)
-            features.append(self._words_to_features(words))
+            # [:-1] to remove CLS token
+            tokens = example.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+            features.append(self._tokens_to_features(tokens))
 
         # build vocabulary of features
         feature_vocabulary = defaultdict(set)
@@ -206,13 +179,13 @@ def _create_feature_to_idx_dict(
             offset += len(feature_values)
         return feature_to_idx_dict
 
-    def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:
+    def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
         """Convert words into discrete features."""
 
         configured_features = self.component_config["features"]
-        words_features = []
+        features = []
 
-        for word_idx in range(len(words)):
+        for token_idx in range(len(tokens)):
             # get the window size (e.g. before, word, after) of the configured features
             # in case of an even number we will look at one more word before,
             # e.g. window size 4 will result in a window range of
@@ -223,65 +196,40 @@ def _words_to_features(self, words: List[Word]) -> List[Dict[Text, Any]]:
 
             prefixes = [str(i) for i in window_range]
 
-            word_features = {}
+            token_features = {}
 
             for pointer_position in window_range:
-                current_idx = word_idx + pointer_position
+                current_idx = token_idx + pointer_position
 
                 # skip, if current_idx is pointing to a non-existing word
-                if current_idx < 0 or current_idx >= len(words):
+                if current_idx < 0 or current_idx >= len(tokens):
                     continue
 
                 # check if we are at the start or at the end
-                if word_idx == len(words) - 1 and pointer_position == 0:
-                    word_features["EOS"] = True
-                elif word_idx == 0 and pointer_position == 0:
-                    word_features["BOS"] = True
+                if token_idx == len(tokens) - 1 and pointer_position == 0:
+                    token_features["EOS"] = True
+                elif token_idx == 0 and pointer_position == 0:
+                    token_features["BOS"] = True
 
-                word = words[word_idx + pointer_position]
+                token = tokens[token_idx + pointer_position]
 
                 current_feature_idx = pointer_position + half_window_size
                 prefix = prefixes[current_feature_idx]
-                features = configured_features[current_feature_idx]
 
-                for feature in features:
+                for feature in configured_features[current_feature_idx]:
                     # append each feature to a feature vector
-                    value = self.function_dict[feature](word)
-                    word_features[prefix + ":" + feature] = value
-
-            words_features.append(word_features)
-
-        return words_features
-
-    def _convert_to_words(self, message: Message) -> List[Word]:
-        """Takes a sentence and switches it to crfsuite format."""
-
-        words = []
-        if self.pos_features:
-            tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
-            if not tokens:
-                raise ValueError(
-                    f"Missing '{SPACY_DOCS[TEXT_ATTRIBUTE]}'. "
-                    f"Make sure to add 'SpacyNLP' to your pipeline."
-                )
-        else:
-            tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
-            # remove CLS token
-            tokens = tokens[:-1]
-
-        for i, token in enumerate(tokens):
-            pos_tag = self._tag_of_token(token) if self.pos_features else None
-
-            words.append(Word(token.text, pos_tag))
-
-        return words
-
-    @staticmethod
-    def _tag_of_token(token):
-        if spacy.about.__version__ > "2" and token._.has("tag"):
-            return token._.get("tag")
-        else:
-            return token.tag_
+                    value = self.function_dict[feature](token)
+                    if value is None:
+                        logger.debug(
+                            f"Invalid value '{value}' for feature '{feature}'."
+                            f" Feature is ignored."
+                        )
+                        continue
+                    token_features[prefix + ":" + feature] = value
+
+            features.append(token_features)
+
+        return features
 
     @classmethod
     def load(
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 9a66cb6522e9..6e1b8462c9bd 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -1,5 +1,5 @@
 import typing
-from typing import Text, List
+from typing import Text, List, Any
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
 from rasa.nlu.training_data import Message
@@ -9,6 +9,11 @@
 if typing.TYPE_CHECKING:
     from spacy.tokens.doc import Doc  # pytype: disable=import-error
 
+try:
+    import spacy
+except ImportError:
+    spacy = None
+
 
 class SpacyTokenizer(Tokenizer):
 
@@ -29,4 +34,14 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc":
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
 
-        return [Token(t.text, t.idx, lemma=t.lemma_) for t in doc]
+        return [
+            Token(t.text, t.idx, lemma=t.lemma_, data={"pos": self._tag_of_token(t)})
+            for t in doc
+        ]
+
+    @staticmethod
+    def _tag_of_token(token: Any) -> Text:
+        if spacy.about.__version__ > "2" and token._.has("tag"):
+            return token._.get("tag")
+        else:
+            return token.tag_
diff --git a/tests/nlu/featurizers/test_text_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
similarity index 88%
rename from tests/nlu/featurizers/test_text_featurizer.py
rename to tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
index 9d1c71a9fa8a..fba59dcf23e1 100644
--- a/tests/nlu/featurizers/test_text_featurizer.py
+++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
@@ -3,6 +3,7 @@
 
 import scipy.sparse
 
+from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
     LexicalSyntacticFeaturizer,
@@ -87,19 +88,6 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):
     assert np.all(actual[-1] == expected_cls)
 
 
-def test_text_featurizer_missing_spacy_nlp():
-    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})
-
-    train_message = Message("Missing spacy.")
-
-    WhitespaceTokenizer().process(train_message)
-
-    with pytest.raises(ValueError) as excpetions:
-        featurizer.train(TrainingData([train_message]))
-
-    assert "Make sure to add 'SpacyNLP' to your pipeline." in str(excpetions.value)
-
-
 @pytest.mark.parametrize(
     "sentence, expected",
     [
@@ -121,12 +109,12 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
     train_message = Message(sentence)
     test_message = Message(sentence)
 
-    WhitespaceTokenizer().process(train_message)
-    WhitespaceTokenizer().process(test_message)
-
     train_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
     test_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
 
+    SpacyTokenizer().process(train_message)
+    SpacyTokenizer().process(test_message)
+
     featurizer.train(TrainingData([train_message]))
 
     featurizer.process(test_message)
diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py
index 66e8a2b80919..1a7c689798a1 100644
--- a/tests/nlu/tokenizers/test_spacy_tokenizer.py
+++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py
@@ -41,6 +41,24 @@ def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
     assert [t.end for t in tokens] == [i[1] for i in expected_indices]
 
 
+@pytest.mark.parametrize(
+    "text, expected_pos_tags",
+    [
+        ("Forecast for lunch", ["NN", "IN", "NN"]),
+        ("Hello, how are you?", ["UH", ",", "WRB", "VBP", "PRP", "."]),
+    ],
+)
+def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
+    tk = SpacyTokenizer()
+
+    message = Message(text)
+    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
+
+    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)
+
+    assert [t.data.get("pos") for t in tokens] == expected_pos_tags
+
+
 @pytest.mark.parametrize(
     "text, expected_tokens, expected_indices",
     [

From a8bcc2a4c4467283c220c9319767f743d426b813 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 10:33:40 +0100
Subject: [PATCH 129/633] Add BOS and EOS to feature options.

---
 docs/nlu/components.rst                       | 36 ++++++++-------
 .../lexical_syntactic_featurizer.py           | 37 ++++++++-------
 .../test_lexical_syntactic_featurizer.py      | 45 ++++++++++++-------
 3 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index ecd11ba71940..e88b084a92c3 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -324,24 +324,26 @@ LexicalSyntacticFeaturizer
     ==============  =============================================================================================
     Feature Name    Description
     ==============  =============================================================================================
-    low             Checks if the word is lower case.
-    upper           Checks if the word is upper case.
-    title           Checks if the word starts with an uppercase character and all remaining characters are lowercased.
-    digit           Checks if the word contains just digits.
-    prefix5         Take the first five characters of the word.
-    prefix2         Take the first two characters of the word.
-    suffix5         Take the last five characters of the word.
-    suffix3         Take the last three characters of the word.
-    suffix2         Take the last two characters of the word.
-    suffix1         Take the last character of the word.
-    pos             Take the Part-of-Speech tag of the word.
-    pos2            Take the first two characters of the Part-of-Speech tag of the word.
+    BOS             Checks if the token is at the beginning of the sentence.
+    EOS             Checks if the token is at the end of the sentence.
+    low             Checks if the token is lower case.
+    upper           Checks if the token is upper case.
+    title           Checks if the token starts with an uppercase character and all remaining characters are lowercased.
+    digit           Checks if the token contains just digits.
+    prefix5         Take the first five characters of the token.
+    prefix2         Take the first two characters of the token.
+    suffix5         Take the last five characters of the token.
+    suffix3         Take the last three characters of the token.
+    suffix2         Take the last two characters of the token.
+    suffix1         Take the last character of the token.
+    pos             Take the Part-of-Speech tag of the token (spaCy required).
+    pos2            Take the first two characters of the Part-of-Speech tag of the token (spaCy required).
     ==============  =============================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
     previous words, the current word in the sliding window, and the next words.
-    You define the features as [before, word, after] array.
-    If you, for example, want to define features for the word before, the current word, and the word after,
+    You define the features as [before, token, after] array.
+    If you, for example, want to define features for the token before, the current token, and the token after,
     your features configuration could look like this:
 
     .. code-block:: yaml
@@ -349,8 +351,10 @@ LexicalSyntacticFeaturizer
         pipeline:
         - name: "LexicalSyntacticFeaturizer":
           "features": [
-            ["low", "title", "upper"],
+            ["BOS", "EOS", "low", "title", "upper"],
             [
+              "BOS",
+              "EOS",
               "low",
               "prefix5",
               "prefix2",
@@ -361,7 +365,7 @@ LexicalSyntacticFeaturizer
               "title",
               "digit",
             ],
-            ["low", "title", "upper"],
+            ["BOS", "EOS", "low", "title", "upper"],
           ]
 
     This configuration is also the default configuration.
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 16141b27ae68..c1e286fb124b 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -33,8 +33,10 @@ class LexicalSyntacticFeaturizer(Featurizer):
         # "is the preceding word in title case?"
         # POS features require 'SpacyTokenizer'.
         "features": [
-            ["low", "title", "upper"],
+            ["BOS", "EOS", "low", "title", "upper"],
             [
+                "BOS",
+                "EOS",
                 "low",
                 "prefix5",
                 "prefix2",
@@ -45,7 +47,7 @@ class LexicalSyntacticFeaturizer(Featurizer):
                 "title",
                 "digit",
             ],
-            ["low", "title", "upper"],
+            ["BOS", "EOS", "low", "title", "upper"],
         ]
     }
 
@@ -205,27 +207,28 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
                 if current_idx < 0 or current_idx >= len(tokens):
                     continue
 
-                # check if we are at the start or at the end
-                if token_idx == len(tokens) - 1 and pointer_position == 0:
-                    token_features["EOS"] = True
-                elif token_idx == 0 and pointer_position == 0:
-                    token_features["BOS"] = True
-
                 token = tokens[token_idx + pointer_position]
 
                 current_feature_idx = pointer_position + half_window_size
                 prefix = prefixes[current_feature_idx]
 
                 for feature in configured_features[current_feature_idx]:
-                    # append each feature to a feature vector
-                    value = self.function_dict[feature](token)
-                    if value is None:
-                        logger.debug(
-                            f"Invalid value '{value}' for feature '{feature}'."
-                            f" Feature is ignored."
-                        )
-                        continue
-                    token_features[prefix + ":" + feature] = value
+                    # check if we are at the start or at the end
+                    if feature == "EOS" or feature == "BOS":
+                        if token_idx + pointer_position == len(tokens) - 1:
+                            token_features["EOS"] = True
+                        elif token_idx + pointer_position == 0:
+                            token_features["BOS"] = True
+                    else:
+                        # append each feature to a feature vector
+                        value = self.function_dict[feature](token)
+                        if value is None:
+                            logger.debug(
+                                f"Invalid value '{value}' for feature '{feature}'."
+                                f" Feature is ignored."
+                            )
+                            continue
+                        token_features[prefix + ":" + feature] = value
 
             features.append(token_features)
 
diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
index fba59dcf23e1..4632c415f80f 100644
--- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
+++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
@@ -14,23 +14,37 @@
 
 
 @pytest.mark.parametrize(
-    "sentence, expected, expected_cls",
+    "sentence, expected_features",
     [
         (
             "hello goodbye hello",
-            [[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0]],
-            [[2.0, 3.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0]],
+            [
+                [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0],
+                [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
+                [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+                [2.0, 3.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0],
+            ],
         ),
         (
             "a 1 2",
-            [[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]],
-            [[2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0]],
+            [
+                [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0],
+                [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
+                [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+                [2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+            ],
         ),
     ],
 )
-def test_text_featurizer(sentence, expected, expected_cls):
+def test_text_featurizer(sentence, expected_features):
     featurizer = LexicalSyntacticFeaturizer(
-        {"features": [["upper"], ["prefix2", "suffix2", "digit"], ["low"]]}
+        {
+            "features": [
+                ["BOS", "EOS", "upper"],
+                ["BOS", "EOS", "prefix2", "suffix2", "digit"],
+                ["BOS", "EOS", "low"],
+            ]
+        }
     )
 
     train_message = Message(sentence)
@@ -49,8 +63,7 @@ def test_text_featurizer(sentence, expected, expected_cls):
 
     actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
 
-    assert np.all(actual[0] == expected)
-    assert np.all(actual[-1] == expected_cls)
+    assert np.all(actual == expected_features)
 
 
 @pytest.mark.parametrize(
@@ -58,8 +71,8 @@ def test_text_featurizer(sentence, expected, expected_cls):
     [
         (
             "hello 123 hello 123 hello",
-            [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]],
-            [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 1.0, 1.0]],
+            [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]],
+            [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0]],
         )
     ],
 )
@@ -94,11 +107,11 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):
         (
             "The sun is shining",
             [
-                [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
-                [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
-                [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
-                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0],
+                [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
+                [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
+                [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0],
+                [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
+                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
             ],
         )
     ],

From a9f11411e47b58c72c9c35090e9cf5d6dd4c74fe Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 20 Jan 2020 10:49:38 +0100
Subject: [PATCH 130/633] move train_on_batch to RasaModel

---
 .../embedding_intent_classifier.py            | 30 +------------
 rasa/utils/tf_models.py                       | 42 +++++++++++++++----
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fcbcb661b7e0..a8482c33e58e 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1296,26 +1296,8 @@ def _train_losses_scores(
 
         return losses, scores
 
-    def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> None:
-        with tf.GradientTape() as tape:
-            losses, scores = self._train_losses_scores(batch_in)
-            regularization_loss = tf.math.add_n(self.losses)
-            pred_loss = tf.math.add_n(list(losses.values()))
-            total_loss = pred_loss + regularization_loss
-
-        gradients = tape.gradient(total_loss, self.trainable_variables)
-        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-
-        self.train_metrics["t_loss"].update_state(total_loss)
-        for k, v in losses.items():
-            self.train_metrics[k].update_state(v)
-        for k, v in scores.items():
-            self.train_metrics[k].update_state(v)
-
     def train_dataset(
-        self, batch_size: "tf.Tensor", session_data: SessionDataType
+        self, batch_size: Union[int, tf.Tensor], session_data: SessionDataType
     ) -> tf.data.Dataset:
         return train_utils.create_tf_dataset(
             session_data,
@@ -1325,16 +1307,6 @@ def train_dataset(
             shuffle=True,
         )
 
-    def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
-        losses, scores = self._train_losses_scores(batch_in)
-        total_loss = tf.math.add_n(list(losses.values())) + self.losses
-
-        self.eval_metrics["val_t_loss"].update_state(total_loss)
-        for k, v in losses.items():
-            self.eval_metrics[f"val_{k}"].update_state(v)
-        for k, v in scores.items():
-            self.eval_metrics[f"val_{k}"].update_state(v)
-
     def eval_dataset(
         self, batch_size: "tf.Tensor", session_data: Optional[SessionDataType]
     ) -> tf.data.Dataset:
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index b302c208876a..30a5a6e3c8ba 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -11,7 +11,10 @@
 logger = logging.getLogger(__name__)
 
 
+# noinspection PyMethodOverriding
 class RasaModel(tf.keras.models.Model):
+    """Completely override all public methods of keras Model."""
+
     @staticmethod
     def _update_postfix_dict(
         postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
@@ -62,17 +65,17 @@ def fit(
         if evaluate_on_num_examples > 0:
             if eager:
                 eval_dataset_func = lambda x: self.eval_dataset(x, eval_session_data)
-                eval_func = self.eval
+                evaluate_on_batch_func = self.evaluate_on_batch
             else:
                 eval_dataset_func = tf.function(
                     func=lambda x: self.eval_dataset(x, eval_session_data)
                 )
-                eval_func = tf.function(
-                    self.eval, input_signature=[eval_dataset_func(1).element_spec]
+                evaluate_on_batch_func = tf.function(
+                    self.evaluate_on_batch, input_signature=[eval_dataset_func(1).element_spec]
                 )
         else:
             eval_dataset_func = None
-            eval_func = None
+            evaluate_on_batch_func = None
 
         for ep in pbar:
             ep_batch_size = tf_batch_size * train_utils.linearly_increasing_batch_size(
@@ -108,8 +111,8 @@ def fit(
 
                     # Eval on batches
                     self.set_training_phase(False)
-                    for batch_in in eval_dataset_func(ep_batch_size, eval_session_data):
-                        eval_func(batch_in)
+                    for batch_in in eval_dataset_func(ep_batch_size):
+                        evaluate_on_batch_func(batch_in)
 
                 # Get the metric results
                 postfix_dict.update(
@@ -136,7 +139,32 @@ def predict(
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> None:
-        raise NotImplementedError
+        with tf.GradientTape() as tape:
+            losses, scores = self._train_losses_scores(batch_in)
+            regularization_loss = tf.math.add_n(self.losses)
+            pred_loss = tf.math.add_n(list(losses.values()))
+            total_loss = pred_loss + regularization_loss
+
+        gradients = tape.gradient(total_loss, self.trainable_variables)
+        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+        self.train_metrics["t_loss"].update_state(total_loss)
+        for k, v in losses.items():
+            self.train_metrics[k].update_state(v)
+        for k, v in scores.items():
+            self.train_metrics[k].update_state(v)
+
+    def evaluate_on_batch(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
+        losses, scores = self._train_losses_scores(batch_in)
+        regularization_loss = tf.math.add_n(self.losses)
+        pred_loss = tf.math.add_n(list(losses.values()))
+        total_loss = pred_loss + regularization_loss
+
+        self.eval_metrics["val_t_loss"].update_state(total_loss)
+        for k, v in losses.items():
+            self.eval_metrics[f"val_{k}"].update_state(v)
+        for k, v in scores.items():
+            self.eval_metrics[f"val_{k}"].update_state(v)
 
     def test_on_batch(self) -> None:
         raise NotImplemented

From 5770aae6fd8924e0e0d2c85f36b395fb6e5f82a0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 10:59:24 +0100
Subject: [PATCH 131/633] refactor featurizer

---
 docs/nlu/components.rst                       |  4 +-
 .../lexical_syntactic_featurizer.py           | 79 +++++++++++++------
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index e88b084a92c3..fc4e5ee34904 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -351,7 +351,7 @@ LexicalSyntacticFeaturizer
         pipeline:
         - name: "LexicalSyntacticFeaturizer":
           "features": [
-            ["BOS", "EOS", "low", "title", "upper"],
+            ["low", "title", "upper"],
             [
               "BOS",
               "EOS",
@@ -365,7 +365,7 @@ LexicalSyntacticFeaturizer
               "title",
               "digit",
             ],
-            ["BOS", "EOS", "low", "title", "upper"],
+            ["low", "title", "upper"],
           ]
 
     This configuration is also the default configuration.
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index c1e286fb124b..57dad20186c9 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -33,7 +33,7 @@ class LexicalSyntacticFeaturizer(Featurizer):
         # "is the preceding word in title case?"
         # POS features require 'SpacyTokenizer'.
         "features": [
-            ["BOS", "EOS", "low", "title", "upper"],
+            ["low", "title", "upper"],
             [
                 "BOS",
                 "EOS",
@@ -47,7 +47,7 @@ class LexicalSyntacticFeaturizer(Featurizer):
                 "title",
                 "digit",
             ],
-            ["BOS", "EOS", "low", "title", "upper"],
+            ["low", "title", "upper"],
         ]
     }
 
@@ -160,17 +160,18 @@ def _create_feature_to_idx_dict(
             features.append(self._tokens_to_features(tokens))
 
         # build vocabulary of features
-        feature_vocabulary = defaultdict(set)
-        for sent_features in features:
-            for word_features in sent_features:
-                for feature_name, feature_value in word_features.items():
-                    feature_vocabulary[feature_name].add(feature_value)
-
-        feature_vocabulary = OrderedDict(sorted(feature_vocabulary.items()))
+        feature_vocabulary = self._build_feature_vocabulary(features)
 
         # assign a unique index to each feature value
+        return self._map_features_to_indices(feature_vocabulary)
+
+    @staticmethod
+    def _map_features_to_indices(
+        feature_vocabulary: Dict[Text, List[Text]]
+    ) -> Dict[Text, Dict[Text:int]]:
         feature_to_idx_dict = {}
         offset = 0
+
         for feature_name, feature_values in feature_vocabulary.items():
             feature_to_idx_dict[feature_name] = {
                 str(feature_value): feature_idx
@@ -179,8 +180,25 @@ def _create_feature_to_idx_dict(
                 )
             }
             offset += len(feature_values)
+
         return feature_to_idx_dict
 
+    @staticmethod
+    def _build_feature_vocabulary(
+        features: List[List[Dict[Text, Any]]]
+    ) -> Dict[Text, List[Text]]:
+        feature_vocabulary = defaultdict(set)
+
+        for sent_features in features:
+            for word_features in sent_features:
+                for feature_name, feature_value in word_features.items():
+                    feature_vocabulary[feature_name].add(feature_value)
+
+        # sort items to ensure same order every time (for tests)
+        feature_vocabulary = OrderedDict(sorted(feature_vocabulary.items()))
+
+        return feature_vocabulary
+
     def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
         """Convert words into discrete features."""
 
@@ -203,7 +221,7 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
             for pointer_position in window_range:
                 current_idx = token_idx + pointer_position
 
-                # skip, if current_idx is pointing to a non-existing word
+                # skip, if current_idx is pointing to a non-existing token
                 if current_idx < 0 or current_idx >= len(tokens):
                     continue
 
@@ -213,27 +231,36 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
                 prefix = prefixes[current_feature_idx]
 
                 for feature in configured_features[current_feature_idx]:
-                    # check if we are at the start or at the end
-                    if feature == "EOS" or feature == "BOS":
-                        if token_idx + pointer_position == len(tokens) - 1:
-                            token_features["EOS"] = True
-                        elif token_idx + pointer_position == 0:
-                            token_features["BOS"] = True
-                    else:
-                        # append each feature to a feature vector
-                        value = self.function_dict[feature](token)
-                        if value is None:
-                            logger.debug(
-                                f"Invalid value '{value}' for feature '{feature}'."
-                                f" Feature is ignored."
-                            )
-                            continue
-                        token_features[prefix + ":" + feature] = value
+                    token_features[prefix + ":" + feature] = self._get_feature_value(
+                        feature, token, token_idx, pointer_position, len(tokens)
+                    )
 
             features.append(token_features)
 
         return features
 
+    def _get_feature_value(
+        self,
+        feature: Text,
+        token: Token,
+        token_idx: int,
+        pointer_position: int,
+        token_length: int,
+    ):
+        if feature == "EOS":
+            return token_idx + pointer_position == token_length - 1
+
+        if feature == "BOS":
+            return token_idx + pointer_position == 0
+
+        value = self.function_dict[feature](token)
+        if value is None:
+            logger.debug(
+                f"Invalid value '{value}' for feature '{feature}'."
+                f" Feature is ignored."
+            )
+        return value
+
     @classmethod
     def load(
         cls,

From f1cda5963d665f2046826f3a5071c6cef2743c63 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 10:59:55 +0100
Subject: [PATCH 132/633] refactor featurizer

---
 .../lexical_syntactic_featurizer.py           |  4 +++-
 .../test_lexical_syntactic_featurizer.py      | 23 +++++++++----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 57dad20186c9..eecbb3c40726 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -168,7 +168,7 @@ def _create_feature_to_idx_dict(
     @staticmethod
     def _map_features_to_indices(
         feature_vocabulary: Dict[Text, List[Text]]
-    ) -> Dict[Text, Dict[Text:int]]:
+    ) -> Dict[Text, Dict[Text, int]]:
         feature_to_idx_dict = {}
         offset = 0
 
@@ -181,6 +181,8 @@ def _map_features_to_indices(
             }
             offset += len(feature_values)
 
+        print(feature_to_idx_dict)
+
         return feature_to_idx_dict
 
     @staticmethod
diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
index 4632c415f80f..acd89825e61a 100644
--- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
+++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
@@ -19,19 +19,18 @@
         (
             "hello goodbye hello",
             [
-                [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0],
-                [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
-                [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0],
-                [2.0, 3.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0],
+                [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0],
+                [0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0],
+                [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
+                [1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0],
             ],
         ),
         (
-            "a 1 2",
+            "a 1",
             [
-                [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0],
-                [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
-                [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
-                [2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+                [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0],
+                [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
+                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
             ],
         ),
     ],
@@ -40,9 +39,9 @@ def test_text_featurizer(sentence, expected_features):
     featurizer = LexicalSyntacticFeaturizer(
         {
             "features": [
-                ["BOS", "EOS", "upper"],
-                ["BOS", "EOS", "prefix2", "suffix2", "digit"],
-                ["BOS", "EOS", "low"],
+                ["BOS", "upper"],
+                ["BOS", "EOS", "prefix2", "digit"],
+                ["EOS", "low"],
             ]
         }
     )

From 8559e168dfb5b5c45e5ff7e59655eabd2e76f43c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 11:04:55 +0100
Subject: [PATCH 133/633] update test

---
 tests/nlu/training/test_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 4c563174ed40..04a889a6b2ae 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -34,7 +34,7 @@ def pipelines_for_tests():
                 "MitieFeaturizer",
                 "SpacyFeaturizer",
                 "RegexFeaturizer",
-                "TextFeaturizer",
+                "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "ConveRTFeaturizer",
                 "MitieEntityExtractor",

From 2979c97acc088c11606064170ab7fe4e0c297698 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 12:42:47 +0100
Subject: [PATCH 134/633] fix formatting issues

---
 .../embedding_intent_classifier.py            | 13 ++------
 rasa/utils/tf_models.py                       | 30 ++++++++++---------
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fcbcb661b7e0..a409eadb6ed6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -854,8 +854,8 @@ def load(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
-            _tf_config = pickle.load(f)
+        # with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
+        #    _tf_config = pickle.load(f)
 
         with open(
             os.path.join(model_dir, file_name + ".session_data_example.pkl"), "rb"
@@ -1376,15 +1376,6 @@ def predict(
             sim_all = self._loss_label.sim(
                 cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
             )
-            # label = self._create_bow(
-            #     tf_batch_data["label_features"],
-            #     tf_batch_data["label_mask"][0],
-            #     "label",
-            # )
-            # label_embed = self._embed["label"](label)
-            # sim = train_utils.tf_raw_sim(
-            #     cls_embed[:, tf.newaxis, :], label_embed, None
-            # )
 
             scores = train_utils.confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index b302c208876a..ca33ff5aea99 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -45,15 +45,19 @@ def fit(
 
         tf_batch_size = tf.ones((), tf.int32)
 
+        def train_dataset_function(x):
+            return self.train_dataset(x, session_data)
+
+        def eval_dataset_function(x):
+            return self.eval_dataset(x, eval_session_data)
+
         if eager:
             # allows increasing batch size
-            train_dataset_func = lambda x: self.train_dataset(x, session_data)
+            train_dataset_func = train_dataset_function
             train_on_batch_func = self.train_on_batch
         else:
             # allows increasing batch size
-            train_dataset_func = tf.function(
-                func=lambda x: self.train_dataset(x, session_data)
-            )
+            train_dataset_func = tf.function(func=train_dataset_function)
             train_on_batch_func = tf.function(
                 self.train_on_batch,
                 input_signature=[train_dataset_func(1).element_spec],
@@ -61,12 +65,10 @@ def fit(
 
         if evaluate_on_num_examples > 0:
             if eager:
-                eval_dataset_func = lambda x: self.eval_dataset(x, eval_session_data)
+                eval_dataset_func = eval_dataset_function
                 eval_func = self.eval
             else:
-                eval_dataset_func = tf.function(
-                    func=lambda x: self.eval_dataset(x, eval_session_data)
-                )
+                eval_dataset_func = tf.function(func=eval_dataset_function)
                 eval_func = tf.function(
                     self.eval, input_signature=[eval_dataset_func(1).element_spec]
                 )
@@ -123,7 +125,7 @@ def fit(
             logger.info("Finished training.")
 
     def compile(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError
 
     def evaluate(self) -> None:
         pass
@@ -139,16 +141,16 @@ def train_on_batch(
         raise NotImplementedError
 
     def test_on_batch(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError
 
     def predict_on_batch(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError
 
     def fit_generator(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError
 
     def evaluate_generator(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError
 
     def predict_generator(self) -> None:
-        raise NotImplemented
+        raise NotImplementedError

From c5b9dde6dcc59b3f6d0748fc6e0c439a8ad10aeb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 12:53:44 +0100
Subject: [PATCH 135/633] update RasaModel

---
 .../embedding_intent_classifier.py            |  4 ++--
 rasa/utils/tf_models.py                       | 23 ++++++++++---------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index a409eadb6ed6..b0c5c49b8f73 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1297,7 +1297,7 @@ def _train_losses_scores(
         return losses, scores
 
     def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         with tf.GradientTape() as tape:
             losses, scores = self._train_losses_scores(batch_in)
@@ -1350,7 +1350,7 @@ def build_for_predict(self, session_data: SessionDataType) -> None:
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> Dict[Text, tf.Tensor]:
         tf_batch_data = train_utils.batch_to_session_data(
             batch_in, self.session_data_signature
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index ca33ff5aea99..d12055f55da6 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -31,7 +31,8 @@ def fit(
         evaluate_on_num_examples: int,
         evaluate_every_num_epochs: int,
         silent: bool = False,
-        eager: bool = True,
+        eager: bool = False,
+        **kwargs,
     ) -> None:
         """Train tf graph"""
 
@@ -110,7 +111,7 @@ def eval_dataset_function(x):
 
                     # Eval on batches
                     self.set_training_phase(False)
-                    for batch_in in eval_dataset_func(ep_batch_size, eval_session_data):
+                    for batch_in in eval_dataset_func(ep_batch_size):
                         eval_func(batch_in)
 
                 # Get the metric results
@@ -124,33 +125,33 @@ def eval_dataset_function(x):
         if not disable:
             logger.info("Finished training.")
 
-    def compile(self) -> None:
+    def compile(self, **kwargs) -> None:
         raise NotImplementedError
 
-    def evaluate(self) -> None:
+    def evaluate(self, **kwargs) -> None:
         pass
 
     def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> Dict[Text, tf.Tensor]:
         pass
 
     def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         raise NotImplementedError
 
-    def test_on_batch(self) -> None:
+    def test_on_batch(self, **kwargs) -> None:
         raise NotImplementedError
 
-    def predict_on_batch(self) -> None:
+    def predict_on_batch(self, **kwargs) -> None:
         raise NotImplementedError
 
-    def fit_generator(self) -> None:
+    def fit_generator(self, **kwargs) -> None:
         raise NotImplementedError
 
-    def evaluate_generator(self) -> None:
+    def evaluate_generator(self, **kwargs) -> None:
         raise NotImplementedError
 
-    def predict_generator(self) -> None:
+    def predict_generator(self, **kwargs) -> None:
         raise NotImplementedError

From 0df08c94f47fd6208ef8bf8a2231a15950719f09 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 12:57:41 +0100
Subject: [PATCH 136/633] fix docs

---
 docs/nlu/components.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index fc4e5ee34904..570a8969f9d2 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -306,7 +306,7 @@ CountVectorsFeaturizer
 
 
 LexicalSyntacticFeaturizer
-~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :Short: Lexical and syntactic feature creation to support entity extraction.
 :Outputs:

From 5c86ec57175942c1f6408fa26f7556cc64d0eead Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 20 Jan 2020 12:58:19 +0100
Subject: [PATCH 137/633] turn of eager by default

---
 rasa/utils/tf_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 30a5a6e3c8ba..aa68765af02f 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -34,7 +34,7 @@ def fit(
         evaluate_on_num_examples: int,
         evaluate_every_num_epochs: int,
         silent: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Train tf graph"""
 

From 5a9d7bb56df1c727627f7166b03ccf4a7c64276a Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 20 Jan 2020 14:07:13 +0100
Subject: [PATCH 138/633] refactored code and requirements for tf 2.1

---
 ...irements_pretrained_embeddings_convert.txt |  4 +--
 .../dense_featurizer/convert_featurizer.py    | 28 ++++++++-----------
 rasa/nlu/tokenizers/convert_tokenizer.py      | 18 ++++--------
 3 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt
index 265cbaeee044..7a96d5bba9e3 100644
--- a/alt_requirements/requirements_pretrained_embeddings_convert.txt
+++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt
@@ -1,5 +1,5 @@
 # Minimum Install Requirements
 -r ../requirements.txt
 
-tensorflow_text==1.15.1
-tensorflow_hub==0.6.0
+tensorflow_text==2.1.0rc0
+tensorflow_hub==0.7.0
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index f451e5950247..16a260351bb6 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -32,20 +32,12 @@ def _load_model(self) -> None:
         import tensorflow_text
         import tensorflow_hub as tfhub
 
-        self.graph = tf.Graph()
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
 
-        with self.graph.as_default():
-            self.session = tf.Session()
-            self.module = tfhub.Module(model_url)
+        self.module = tfhub.load(model_url)
 
-            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            self.sentence_encoding_tensor = self.module(self.text_placeholder)
-            self.sequence_encoding_tensor = self.module(
-                self.text_placeholder, signature="encode_sequence", as_dict=True
-            )
-            self.session.run(tf.tables_initializer())
-            self.session.run(tf.global_variables_initializer())
+        self.sentence_encoding_signature = self.module.signatures["default"]
+        self.sequence_encoding_signature = self.module.signatures["encode_sequence"]
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
 
@@ -157,14 +149,16 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]:
         return texts
 
     def _sentence_encoding_of_text(self, batch: List[Text]) -> np.ndarray:
-        return self.session.run(
-            self.sentence_encoding_tensor, feed_dict={self.text_placeholder: batch}
-        )
+
+        return self.sentence_encoding_signature(tf.convert_to_tensor(batch))[
+            "default"
+        ].numpy()
 
     def _sequence_encoding_of_text(self, batch: List[Text]) -> np.ndarray:
-        return self.session.run(
-            self.sequence_encoding_tensor, feed_dict={self.text_placeholder: batch}
-        )["sequence_encoding"]
+
+        return self.sequence_encoding_signature(tf.convert_to_tensor(batch))[
+            "sequence_encoding"
+        ].numpy()
 
     def train(
         self,
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 8b61624c1c6a..620f396072c7 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -33,23 +33,17 @@ def _load_tokenizer_params(self):
         import tensorflow_text
         import tensorflow_hub as tfhub
 
-        self.graph = tf.Graph()
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
 
-        with self.graph.as_default():
-            self.session = tf.Session()
-            self.module = tfhub.Module(model_url)
+        self.module = tfhub.load(model_url)
 
-            self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
-            self.tokenized = self.module(self.text_placeholder, signature="tokenize")
-
-            self.session.run(tf.tables_initializer())
-            self.session.run(tf.global_variables_initializer())
+        self.tokenize_signature = self.module.signatures["tokenize"]
 
     def _tokenize(self, sentence: Text) -> Any:
-        return self.session.run(
-            self.tokenized, feed_dict={self.text_placeholder: [sentence]}
-        )
+
+        return self.tokenize_signature(tf.convert_to_tensor([sentence]))[
+            "default"
+        ].numpy()
 
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         """Tokenize the text using the ConveRT model.

From 0d9b39bc553e6b223fda1f5bd9cc21aceccb5a08 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 20 Jan 2020 14:17:53 +0100
Subject: [PATCH 139/633] update reqs and docs

---
 docs/nlu/choosing-a-pipeline.rst | 2 +-
 requirements.txt                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 96932cc953fa..2d3f60c4bb84 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -66,7 +66,7 @@ examples are already very similar, the intent classified for both is highly like
 large enough training data.
 
     .. note::
-        To use ``pretrained_embeddings_convert`` pipeline, you should install ``tensorflow-text==1.15.1`` and ``tensorflow-hub==0.6.0``. Otherwise, you can also pip install Rasa with ``pip install rasa[convert]``. Please also note that tensorflow-text is only currently supported on Linux platforms.
+        To use ``pretrained_embeddings_convert`` pipeline, you should install ``tensorflow-text==2.1.0rc0`` and ``tensorflow-hub==0.7.0``. Otherwise, you can also pip install Rasa with ``pip install rasa[convert]``. Please also note that tensorflow-text is only currently supported on Linux platforms.
 
 supervised_embeddings
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/requirements.txt b/requirements.txt
index 6476181da9be..b084163498db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -61,6 +61,6 @@ python-dateutil==2.8.0
 gast==0.2.2
 # for new featurizers
 tensorflow==2.1.0
-tensorflow_hub==0.6.0
+tensorflow_hub==0.7.0
 tensorflow-addons==0.7.0
 tensorflow-probability==0.7.0

From 4aea5c2fbf7a7164732903d806c5a9d4ba7cadb3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 14:19:24 +0100
Subject: [PATCH 140/633] move tf dataset generation to rasa model data

---
 .../embedding_intent_classifier.py            | 244 ++++++------------
 .../selectors/embedding_response_selector.py  |   2 +-
 rasa/utils/tf_model_data.py                   | 183 ++++++++++++-
 rasa/utils/tf_models.py                       |  60 +++--
 rasa/utils/train_utils.py                     |   5 +-
 tests/utils/test_tf_model_data.py             | 148 +++++++++++
 tests/utils/test_train_utils.py               |  52 ++--
 7 files changed, 468 insertions(+), 226 deletions(-)
 create mode 100644 tests/utils/test_tf_model_data.py

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b0c5c49b8f73..dcfe9948c0f3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -17,7 +17,6 @@
 from rasa.utils import train_utils
 from rasa.utils import tf_layers
 from rasa.utils import tf_models
-from rasa.utils.train_utils import SessionDataType, SessionDataSignature
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -26,10 +25,12 @@
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
 )
+from rasa.utils.tf_model_data import RasaModelData, DataSignature
 
 import tensorflow as tf
 import tensorflow_addons as tfa
 
+
 logger = logging.getLogger(__name__)
 
 from rasa.nlu.config import RasaNLUModelConfig
@@ -247,7 +248,7 @@ def __init__(
 
         self._tf_config = train_utils.load_tf_config(self.component_config)
 
-        self.session_data_example = None
+        self.model_data_example = None
 
     # training data helpers:
     @staticmethod
@@ -337,54 +338,10 @@ def _extract_and_add_features(
 
         return sparse_features, dense_features
 
-    @staticmethod
-    def _add_to_session_data(
-        session_data: SessionDataType, key: Text, features: List[np.ndarray]
-    ):
-        if not features:
-            return
-
-        session_data[key] = []
-
-        for data in features:
-            if data.size > 0:
-                session_data[key].append(data)
-
-        if not session_data[key]:
-            del session_data[key]
-
-    @staticmethod
-    def _add_mask_to_session_data(
-        session_data: SessionDataType, key: Text, from_key: Text
-    ):
-
-        if not session_data.get(from_key):
-            return
-
-        session_data[key] = []
-
-        for data in session_data[from_key]:
-            if data.size > 0:
-                # explicitly add last dimension to mask
-                # to track correctly dynamic sequences
-                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
-                session_data[key].append(mask)
-                break
-
-    @staticmethod
-    def _get_num_of_features(session_data: SessionDataType, key: Text) -> int:
-        num_features = 0
-        for data in session_data[key]:
-            if data.size > 0:
-                num_features += data[0].shape[-1]
-        return num_features
-
-    def check_input_dimension_consistency(self, session_data: SessionDataType):
+    def check_input_dimension_consistency(self, model_data: RasaModelData):
         if self.component_config[SHARE_HIDDEN_LAYERS]:
-            num_text_features = self._get_num_of_features(session_data, "text_features")
-            num_intent_features = self._get_num_of_features(
-                session_data, "label_features"
-            )
+            num_text_features = model_data.get_feature_dimension("text_features")
+            num_intent_features = model_data.get_feature_dimension("label_features")
 
             if num_text_features != num_intent_features:
                 raise ValueError(
@@ -433,7 +390,7 @@ def _create_label_data(
         training_data: TrainingData,
         label_id_dict: Dict[Text, int],
         attribute: Text,
-    ) -> SessionDataType:
+    ) -> RasaModelData:
         """Create matrix with label_ids encoded in rows as bag of words.
 
         Find a training example for each label and get the encoded features
@@ -462,9 +419,9 @@ def _create_label_data(
         else:
             features = self._compute_default_label_features(labels_example)
 
-        label_data = {}
-        self._add_to_session_data(label_data, "label_features", features)
-        self._add_mask_to_session_data(label_data, "label_mask", "label_features")
+        label_data = RasaModelData()
+        label_data.add_features("label_features", features)
+        label_data.add_mask("label_mask", "label_features")
 
         return label_data
 
@@ -478,13 +435,13 @@ def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]
             )
         ]
 
-    def _create_session_data(
+    def _create_model_data(
         self,
         training_data: List[Message],
         label_id_dict: Optional[Dict[Text, int]] = None,
         tag_id_dict: Optional[Dict[Text, int]] = None,
         label_attribute: Optional[Text] = None,
-    ) -> SessionDataType:
+    ) -> RasaModelData:
         """Prepare data for training and create a SessionDataType object"""
 
         X_sparse = []
@@ -526,29 +483,30 @@ def _create_session_data(
         label_ids = np.array(label_ids)
         tag_ids = np.array(tag_ids)
 
-        session_data = {}
-        self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense])
-        self._add_to_session_data(session_data, "label_features", [Y_sparse, Y_dense])
+        model_data = RasaModelData()
+        model_data.add_features("text_features", [X_sparse, X_dense])
+        model_data.add_features("label_features", [Y_sparse, Y_dense])
         if label_attribute and (
-            "label_features" not in session_data or not session_data["label_features"]
+            "label_features" not in model_data.keys()
+            or not model_data.get("label_features")
         ):
             # no label features are present, get default features from _label_data
-            session_data["label_features"] = self._use_default_label_features(label_ids)
+            model_data.set(
+                "label_features", self._use_default_label_features(label_ids)
+            )
 
         # explicitly add last dimension to label_ids
         # to track correctly dynamic sequences
-        self._add_to_session_data(
-            session_data, "label_ids", [np.expand_dims(label_ids, -1)]
-        )
-        self._add_to_session_data(session_data, "tag_ids", [tag_ids])
+        model_data.add_features("label_ids", [np.expand_dims(label_ids, -1)])
+        model_data.add_features("tag_ids", [tag_ids])
 
-        self._add_mask_to_session_data(session_data, "text_mask", "text_features")
-        self._add_mask_to_session_data(session_data, "label_mask", "label_features")
+        model_data.add_mask("text_mask", "text_features")
+        model_data.add_mask("label_mask", "label_features")
 
-        return session_data
+        return model_data
 
     # train helpers
-    def preprocess_train_data(self, training_data: TrainingData) -> SessionDataType:
+    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         """Prepares data for training.
 
         Performs sanity checks on training data, extracts encodings for labels.
@@ -565,7 +523,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> SessionDataType:
         tag_id_dict = self._create_tag_id_dict(training_data)
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
-        session_data = self._create_session_data(
+        model_data = self._create_model_data(
             training_data.training_examples,
             label_id_dict,
             tag_id_dict,
@@ -574,13 +532,13 @@ def preprocess_train_data(self, training_data: TrainingData) -> SessionDataType:
 
         self.num_tags = len(self.inverted_tag_dict)
 
-        self.check_input_dimension_consistency(session_data)
+        self.check_input_dimension_consistency(model_data)
 
-        return session_data
+        return model_data
 
     @staticmethod
-    def _check_enough_labels(session_data: SessionDataType) -> bool:
-        return len(np.unique(session_data["label_ids"])) >= 2
+    def _check_enough_labels(model_data: RasaModelData) -> bool:
+        return len(np.unique(model_data.get("label_ids"))) >= 2
 
     def train(
         self,
@@ -595,10 +553,10 @@ def train(
         # set numpy random seed
         np.random.seed(self.component_config[RANDOM_SEED])
 
-        session_data = self.preprocess_train_data(training_data)
+        model_data = self.preprocess_train_data(training_data)
 
         if self.component_config[INTENT_CLASSIFICATION]:
-            possible_to_train = self._check_enough_labels(session_data)
+            possible_to_train = self._check_enough_labels(model_data)
 
             if not possible_to_train:
                 logger.error(
@@ -608,66 +566,42 @@ def train(
                 )
                 return
 
-        if self.component_config[EVAL_NUM_EXAMPLES]:
-            session_data, eval_session_data = train_utils.train_val_split(
-                session_data,
-                self.component_config[EVAL_NUM_EXAMPLES],
-                self.component_config[RANDOM_SEED],
-                label_key="label_ids",
-            )
-        else:
-            eval_session_data = None
-
         # keep one example for persisting and loading
-        self.session_data_example = {
-            k: [v[:1] for v in vs] for k, vs in session_data.items()
+        self.model_data_example = {
+            k: [v[:1] for v in vs] for k, vs in model_data.items()
         }
 
         # TODO set it in the model
         # set random seed
         tf.random.set_seed(self.component_config[RANDOM_SEED])
 
-        session_data_signature = self.create_signature(session_data)
+        model_data_signature = model_data.get_signature()
 
         self.model = DIET(
-            session_data_signature,
+            model_data_signature,
             self._label_data,
             self.inverted_tag_dict,
             self.component_config,
         )
 
         self.model.fit(
+            model_data,
             self.component_config[EPOCHS],
             self.component_config[BATCH_SIZES],
-            session_data,
-            eval_session_data,
             self.component_config[EVAL_NUM_EXAMPLES],
             self.component_config[EVAL_NUM_EPOCHS],
+            label_key="label_ids",
+            batch_strategy=self.component_config[BATCH_STRATEGY],
         )
 
-        # rebuild the graph for prediction
-        # self.model.build_for_predict()
-
-        # self.attention_weights = train_utils.extract_attention(self.attention_weights)
-
-    @staticmethod
-    def create_signature(session_data: SessionDataType):
-        return {
-            key: [
-                (True if isinstance(v[0], scipy.sparse.spmatrix) else False, v[0].shape)
-                for v in values
-            ]
-            for key, values in session_data.items()
-        }
-
     # process helpers
     def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
         if self.model is None or self.predict_func is None:
             return
 
         # create session data from message and convert it into a batch of 1
-        session_data = self._create_session_data([message])
-        predict_dataset = self.model.predict_dataset(session_data)
+        model_data = self._create_model_data([message])
+        predict_dataset = model_data.as_tf_dataset(1, label_key="label_ids")
         batch_in = next(iter(predict_dataset))
 
         return self.predict_func(batch_in)
@@ -808,9 +742,9 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         self.model.save_weights(tf_model_file, save_format="tf")
 
         with open(
-            os.path.join(model_dir, file_name + ".session_data_example.pkl"), "wb"
+            os.path.join(model_dir, file_name + ".model_data_example.pkl"), "wb"
         ) as f:
-            pickle.dump(self.session_data_example, f)
+            pickle.dump(self.model_data_example, f)
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
             pickle.dump(self._label_data, f)
@@ -858,9 +792,9 @@ def load(
         #    _tf_config = pickle.load(f)
 
         with open(
-            os.path.join(model_dir, file_name + ".session_data_example.pkl"), "rb"
+            os.path.join(model_dir, file_name + ".model_data_example.pkl"), "rb"
         ) as f:
-            session_data_example = pickle.load(f)
+            model_data_example = RasaModelData(pickle.load(f))
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
             label_data = pickle.load(f)
@@ -884,27 +818,35 @@ def load(
             elif meta[LOSS_TYPE] == "margin":
                 meta[SIMILARITY_TYPE] = "cosine"
 
-        model = DIET(
-            EmbeddingIntentClassifier.create_signature(session_data_example),
-            label_data,
-            inv_tag_dict,
-            meta,
-        )
+        model = DIET(model_data_example.get_signature(), label_data, inv_tag_dict, meta)
 
         logger.debug("Loading the model ...")
-        model.fit(1, 1, session_data_example, None, 0, 0, silent=True, eager=True)
+        model.fit(
+            model_data_example,
+            1,
+            1,
+            0,
+            0,
+            label_key="label_ids",
+            batch_strategy=meta[BATCH_STRATEGY],
+            silent=True,
+            eager=True,
+        )
         model.load_weights(tf_model_file)
 
         # build the graph for prediction
         model.set_training_phase(False)
-        session_data = {k: vs for k, vs in session_data_example.items() if "text" in k}
-        model.session_data_signature = EmbeddingIntentClassifier.create_signature(
-            session_data
+        model_data = RasaModelData(
+            {k: vs for k, vs in model_data_example.items() if "text" in k}
+        )
+        model.data_signature = model_data.get_signature()
+        model.build_for_predict(model_data)
+
+        predict_dataset = model_data.as_tf_dataset(
+            1, label_key="label_ids", batch_strategy="sequence", shuffle=False
         )
-        model.build_for_predict(session_data)
-        predict_dataset = model.predict_dataset(session_data)
         predict_func = tf.function(
-            model.predict, input_signature=[predict_dataset.element_spec]
+            func=model.predict, input_signature=[predict_dataset.element_spec]
         )
         batch_in = next(iter(predict_dataset))
         predict_func(batch_in)
@@ -923,7 +865,7 @@ def load(
 class DIET(tf_models.RasaModel):
     @staticmethod
     def _create_sparse_dense_layer(
-        data_signature: List[Tuple[bool, List[int]]],
+        data_signature: List[DataSignature],
         name: Text,
         reg_lambda: float,
         dense_dim: int,
@@ -944,7 +886,7 @@ def _create_sparse_dense_layer(
             )
 
     @staticmethod
-    def _input_dim(data_signature: List[Tuple[bool, List[int]]], dense_dim: int) -> int:
+    def _input_dim(data_signature: List[DataSignature], dense_dim: int) -> int:
 
         for is_sparse, shape in data_signature:
             if not is_sparse:
@@ -957,18 +899,18 @@ def _input_dim(data_signature: List[Tuple[bool, List[int]]], dense_dim: int) ->
 
     def __init__(
         self,
-        session_data_signature: SessionDataSignature,
-        label_data: SessionDataType,
+        data_signature: Dict[Text, List[DataSignature]],
+        label_data: RasaModelData,
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
     ) -> None:
         super(DIET, self).__init__(name="DIET")
 
         # data
-        self.session_data_signature = session_data_signature
-        label_batch = train_utils.prepare_batch(label_data)
+        self.data_signature = data_signature
+        label_batch = label_data.prepare_batch()
         self.tf_label_data = train_utils.batch_to_session_data(
-            label_batch, EmbeddingIntentClassifier.create_signature(label_data)
+            label_batch, label_data.get_signature()
         )
         self._num_tags = len(inverted_tag_dict)
 
@@ -996,13 +938,13 @@ def _prepare_layers(self) -> None:
 
         self._sparse_to_dense = {
             "text": self._create_sparse_dense_layer(
-                self.session_data_signature["text_features"],
+                self.data_signature["text_features"],
                 "text",
                 self.config[C2],
                 self.config[DENSE_DIM]["text"],
             ),
             "label": self._create_sparse_dense_layer(
-                self.session_data_signature["label_features"],
+                self.data_signature["label_features"],
                 "label",
                 self.config[C2],
                 self.config[DENSE_DIM]["label"],
@@ -1249,9 +1191,7 @@ def _entity_loss(
     def _train_losses_scores(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data_signature
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1314,17 +1254,6 @@ def train_on_batch(
         for k, v in scores.items():
             self.train_metrics[k].update_state(v)
 
-    def train_dataset(
-        self, batch_size: "tf.Tensor", session_data: SessionDataType
-    ) -> tf.data.Dataset:
-        return train_utils.create_tf_dataset(
-            session_data,
-            batch_size,
-            label_key="label_ids",
-            batch_strategy=self.config[BATCH_STRATEGY],
-            shuffle=True,
-        )
-
     def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
         losses, scores = self._train_losses_scores(batch_in)
         total_loss = tf.math.add_n(list(losses.values())) + self.losses
@@ -1335,16 +1264,8 @@ def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
         for k, v in scores.items():
             self.eval_metrics[f"val_{k}"].update_state(v)
 
-    def eval_dataset(
-        self, batch_size: "tf.Tensor", session_data: Optional[SessionDataType]
-    ) -> tf.data.Dataset:
-        if session_data is not None:
-            return train_utils.create_tf_dataset(
-                session_data, batch_size, label_key="label_ids"
-            )
-
-    def build_for_predict(self, session_data: SessionDataType) -> None:
-        self.batch_tuple_sizes = train_utils.batch_tuple_sizes(session_data)
+    def build_for_predict(self, session_data: RasaModelData) -> None:
+        self.batch_tuple_sizes = session_data.batch_tuple_sizes()
 
         all_labels_embed, _ = self._build_all_b()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
@@ -1352,9 +1273,7 @@ def build_for_predict(self, session_data: SessionDataType) -> None:
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = train_utils.batch_to_session_data(
-            batch_in, self.session_data_signature
-        )
+        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1388,6 +1307,3 @@ def predict(
             out["e_ids"] = pred_ids
 
         return out
-
-    def predict_dataset(self, session_data: SessionDataType) -> tf.data.Dataset:
-        return train_utils.create_tf_dataset(session_data, 1, label_key="label_ids")
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 7bf71d1b130f..a39a0b84b0e2 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -207,7 +207,7 @@ def preprocess_train_data(self, training_data):
             training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE
         )
 
-        session_data = self._create_session_data(
+        session_data = self._create_model_data(
             training_data.intent_examples,
             label_id_dict,
             label_attribute=RESPONSE_ATTRIBUTE,
diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
index 7e9b461acc95..25c85a2eb3e9 100644
--- a/rasa/utils/tf_model_data.py
+++ b/rasa/utils/tf_model_data.py
@@ -3,10 +3,13 @@
 import tensorflow as tf
 
 from sklearn.model_selection import train_test_split
-from typing import Optional, Dict, Text, List, Tuple, Any, Union, Generator
+from typing import Optional, Dict, Text, List, Tuple, Any, Union, Generator, NamedTuple
 from collections import defaultdict
 
-from utils import train_utils
+
+class DataSignature(NamedTuple):
+    is_sparse: bool
+    shape: List[int]
 
 
 class RasaModelData:
@@ -16,6 +19,12 @@ def __init__(self, data: Optional[Dict[Text, List[np.ndarray]]] = None):
         else:
             self.data = data
 
+    def get(self, key: Text) -> List[np.ndarray]:
+        return self.data[key]
+
+    def set(self, key: Text, value: List[np.ndarray]):
+        self.data[key] = value
+
     def items(self):
         return self.data.items()
 
@@ -84,7 +93,7 @@ def add_mask(self, key: Text, from_key: Text):
                 self.data[key].append(mask)
                 break
 
-    def get_signature(self) -> Dict[Text, Tuple[bool, Tuple[int]]]:
+    def get_signature(self) -> Dict[Text, List[DataSignature]]:
         """Get signature of RasaModelData.
 
         Signature stores the shape and whether features are sparse or not for every
@@ -92,7 +101,10 @@ def get_signature(self) -> Dict[Text, Tuple[bool, Tuple[int]]]:
 
         return {
             key: [
-                (True if isinstance(v[0], scipy.sparse.spmatrix) else False, v[0].shape)
+                DataSignature(
+                    True if isinstance(v[0], scipy.sparse.spmatrix) else False,
+                    v[0].shape,
+                )
                 for v in values
             ]
             for key, values in self.data.items()
@@ -105,7 +117,6 @@ def shuffle(self) -> None:
         ids = np.random.permutation(data_points)
         self.data = self._data_for_ids(ids)
 
-    # noinspection PyPep8Naming
     def balance(self, batch_size: int, shuffle: bool, label_key: Text) -> None:
         """Mix session data to account for class imbalance.
 
@@ -222,6 +233,164 @@ def convert_to_tf_dataset(
             args=([batch_size]),
         )
 
+    def prepare_batch(
+        self,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        tuple_sizes: Optional[Dict[Text, int]] = None,
+    ) -> Tuple[Optional[np.ndarray]]:
+        """Slices session data into batch using given start and end value."""
+
+        batch_data = []
+
+        for key, values in self.data.items():
+            # add None for not present values during processing
+            if not values:
+                if tuple_sizes:
+                    batch_data += [None] * tuple_sizes[key]
+                else:
+                    batch_data.append(None)
+                continue
+
+            for v in values:
+                if start is not None and end is not None:
+                    _data = v[start:end]
+                elif start is not None:
+                    _data = v[start:]
+                elif end is not None:
+                    _data = v[:end]
+                else:
+                    _data = v[:]
+
+                if isinstance(_data[0], scipy.sparse.spmatrix):
+                    batch_data.extend(self._scipy_matrix_to_values(_data))
+                else:
+                    batch_data.append(self._pad_dense_data(_data))
+
+        # len of batch_data is equal to the number of keys in session data
+        return tuple(batch_data)
+
+    def batch_tuple_sizes(self) -> Dict[Text, int]:
+
+        # save the amount of placeholders attributed to session data keys
+        tuple_sizes = defaultdict(int)
+
+        idx = 0
+        for k, values in self.data.items():
+            tuple_sizes[k] = 0
+            for v in values:
+                if isinstance(v[0], scipy.sparse.spmatrix):
+                    tuple_sizes[k] += 3
+                    idx += 3
+                else:
+                    tuple_sizes[k] += 1
+                    idx += 1
+
+        return tuple_sizes
+
+    def as_tf_dataset(
+        self,
+        batch_size: Union["tf.Tensor", int],
+        label_key: Text,
+        batch_strategy: Text = "sequence",
+        shuffle: bool = False,
+    ) -> "tf.data.Dataset":
+        """Create tf dataset."""
+
+        shapes, types = self._get_shapes_types()
+
+        return tf.data.Dataset.from_generator(
+            lambda batch_size_: self._gen_batch(
+                batch_size_, label_key, batch_strategy, shuffle
+            ),
+            output_types=types,
+            output_shapes=shapes,
+            args=([batch_size]),
+        )
+
+    def _get_shapes_types(self) -> Tuple:
+        """Extract shapes and types from session data."""
+
+        types = []
+        shapes = []
+
+        def append_shape(v: np.ndarray):
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                shapes.append((None, v[0].ndim + 1))
+                shapes.append((None,))
+                shapes.append((v[0].ndim + 1))
+            elif v[0].ndim == 0:
+                shapes.append((None,))
+            elif v[0].ndim == 1:
+                shapes.append((None, v[0].shape[-1]))
+            else:
+                shapes.append((None, None, v[0].shape[-1]))
+
+        def append_type(v: np.ndarray):
+            if isinstance(v[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                types.append(tf.int64)
+                types.append(tf.float32)
+                types.append(tf.int64)
+            else:
+                types.append(tf.float32)
+
+        for values in self.data.values():
+            for v in values:
+                append_shape(v)
+                append_type(v)
+
+        return tuple(shapes), tuple(types)
+
+    def _scipy_matrix_to_values(self, array_of_sparse: np.ndarray) -> List[np.ndarray]:
+        """Convert a scipy matrix into inidces, data, and shape."""
+
+        if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
+            array_of_sparse = [x.tocoo() for x in array_of_sparse]
+
+        max_seq_len = max([x.shape[0] for x in array_of_sparse])
+
+        indices = np.hstack(
+            [
+                np.vstack([i * np.ones_like(x.row), x.row, x.col])
+                for i, x in enumerate(array_of_sparse)
+            ]
+        ).T
+        data = np.hstack([x.data for x in array_of_sparse])
+
+        shape = np.array(
+            (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])
+        )
+
+        return [
+            indices.astype(np.int64),
+            data.astype(np.float32),
+            shape.astype(np.int64),
+        ]
+
+    def _pad_dense_data(self, array_of_dense: np.ndarray) -> np.ndarray:
+        """Pad data of different lengths.
+
+        Sequential data is padded with zeros. Zeros are added to the end of data.
+        """
+
+        if array_of_dense[0].ndim < 2:
+            # data doesn't contain a sequence
+            return array_of_dense
+
+        data_size = len(array_of_dense)
+        max_seq_len = max([x.shape[0] for x in array_of_dense])
+
+        data_padded = np.zeros(
+            [data_size, max_seq_len, array_of_dense[0].shape[-1]],
+            dtype=array_of_dense[0].dtype,
+        )
+        for i in range(data_size):
+            data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i]
+
+        return data_padded.astype(np.float32)
+
     def _get_shapes_types(self) -> Tuple:
         """Extract shapes and types from session data."""
 
@@ -279,7 +448,7 @@ def _gen_batch(
             start = batch_num * batch_size
             end = start + batch_size
 
-            yield train_utils.prepare_batch(self.data, start, end)
+            yield self.prepare_batch(start, end)
 
     def _check_train_test_sizes(
         self, number_of_test_examples: int, label_counts: Dict[Any, int]
@@ -300,7 +469,7 @@ def _check_train_test_sizes(
                 f"be at least equal to number of classes {label_counts}."
             )
 
-    def _data_for_ids(self, ids: np.ndarray):
+    def _data_for_ids(self, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
         """Filter session data by ids."""
 
         new_data = defaultdict(list)
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index d12055f55da6..3b58e7d682f4 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -6,7 +6,7 @@
 from rasa.utils.common import is_logging_disabled
 import tensorflow as tf
 
-from rasa.utils.train_utils import SessionDataType
+from rasa.utils.tf_model_data import RasaModelData
 
 logger = logging.getLogger(__name__)
 
@@ -24,14 +24,16 @@ def _update_postfix_dict(
 
     def fit(
         self,
+        model_data: RasaModelData,
         epochs: int,
         batch_size: Union[List[int], int],
-        session_data: SessionDataType,
-        eval_session_data: Optional[SessionDataType],
         evaluate_on_num_examples: int,
         evaluate_every_num_epochs: int,
+        label_key: Text,
+        batch_strategy: Text,
         silent: bool = False,
         eager: bool = False,
+        random_seed: int = 42,
         **kwargs,
     ) -> None:
         """Train tf graph"""
@@ -41,41 +43,51 @@ def fit(
                 f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
                 f"epochs."
             )
+
+            model_data, evaluation_model_data = model_data.split(
+                evaluate_on_num_examples, random_seed, label_key="label_ids"
+            )
+
         disable = silent or is_logging_disabled()
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         tf_batch_size = tf.ones((), tf.int32)
 
         def train_dataset_function(x):
-            return self.train_dataset(x, session_data)
+            return model_data.as_tf_dataset(x, label_key, batch_strategy, shuffle=True)
 
-        def eval_dataset_function(x):
-            return self.eval_dataset(x, eval_session_data)
+        def evaluation_dataset_function(x):
+            return evaluation_model_data.as_tf_dataset(
+                x, label_key, batch_strategy, shuffle=False
+            )
 
         if eager:
             # allows increasing batch size
-            train_dataset_func = train_dataset_function
-            train_on_batch_func = self.train_on_batch
+            tf_train_dataset_function = train_dataset_function
+            tf_train_on_batch_function = self.train_on_batch
         else:
             # allows increasing batch size
-            train_dataset_func = tf.function(func=train_dataset_function)
-            train_on_batch_func = tf.function(
+            tf_train_dataset_function = tf.function(func=train_dataset_function)
+            tf_train_on_batch_function = tf.function(
                 self.train_on_batch,
-                input_signature=[train_dataset_func(1).element_spec],
+                input_signature=[tf_train_dataset_function(1).element_spec],
             )
 
         if evaluate_on_num_examples > 0:
             if eager:
-                eval_dataset_func = eval_dataset_function
-                eval_func = self.eval
+                tf_evaluation_dataset_function = evaluation_dataset_function
+                tf_evaluation_function = self.eval
             else:
-                eval_dataset_func = tf.function(func=eval_dataset_function)
-                eval_func = tf.function(
-                    self.eval, input_signature=[eval_dataset_func(1).element_spec]
+                tf_evaluation_dataset_function = tf.function(
+                    func=evaluation_dataset_function
+                )
+                tf_evaluation_function = tf.function(
+                    self.eval,
+                    input_signature=[tf_evaluation_dataset_function(1).element_spec],
                 )
         else:
-            eval_dataset_func = None
-            eval_func = None
+            tf_evaluation_dataset_function = None
+            tf_evaluation_function = None
 
         for ep in pbar:
             ep_batch_size = tf_batch_size * train_utils.linearly_increasing_batch_size(
@@ -88,11 +100,8 @@ def eval_dataset_function(x):
 
             # Train on batches
             self.set_training_phase(True)
-            for batch_in in train_dataset_func(ep_batch_size):
-                train_on_batch_func(batch_in)
-
-            # print(self.metrics)
-            # exit()
+            for batch_in in tf_train_dataset_function(ep_batch_size):
+                tf_train_on_batch_function(batch_in)
 
             # Get the metric results
             postfix_dict = {
@@ -111,8 +120,8 @@ def eval_dataset_function(x):
 
                     # Eval on batches
                     self.set_training_phase(False)
-                    for batch_in in eval_dataset_func(ep_batch_size):
-                        eval_func(batch_in)
+                    for batch_in in tf_evaluation_dataset_function(ep_batch_size):
+                        tf_evaluation_function(batch_in)
 
                 # Get the metric results
                 postfix_dict.update(
@@ -121,7 +130,6 @@ def eval_dataset_function(x):
 
             pbar.set_postfix(postfix_dict)
 
-            # _write_training_metrics(output_file, ep, train_metrics, val_metrics)
         if not disable:
             logger.info("Finished training.")
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index dc0673480dda..e10a9d44fc61 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -7,6 +7,7 @@
 from sklearn.model_selection import train_test_split
 import tensorflow as tf
 
+from rasa.utils.tf_model_data import DataSignature
 
 logger = logging.getLogger(__name__)
 
@@ -401,7 +402,7 @@ def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
 
 def batch_to_session_data(
     batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
-    session_data_signature: SessionDataSignature,
+    data_signature: Dict[Text, List[DataSignature]],
 ) -> Dict[Text, List[tf.Tensor]]:
     """Convert input batch tensors into batch data format.
 
@@ -414,7 +415,7 @@ def batch_to_session_data(
     batch_data = defaultdict(list)
 
     idx = 0
-    for k, signature in session_data_signature.items():
+    for k, signature in data_signature.items():
         for is_sparse, shape in signature:
             if is_sparse:
                 # explicitly substitute last dimension in shape with known static value
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
new file mode 100644
index 000000000000..6708c0f89d5a
--- /dev/null
+++ b/tests/utils/test_tf_model_data.py
@@ -0,0 +1,148 @@
+import pytest
+import scipy.sparse
+import numpy as np
+
+from rasa.utils.tf_model_data import RasaModelData
+
+
+@pytest.fixture
+async def model_data() -> RasaModelData:
+    return RasaModelData(
+        {
+            "text_features": [
+                np.array(
+                    [
+                        np.random.rand(5, 14),
+                        np.random.rand(2, 14),
+                        np.random.rand(3, 14),
+                        np.random.rand(1, 14),
+                        np.random.rand(3, 14),
+                    ]
+                ),
+                np.array(
+                    [
+                        scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
+                    ]
+                ),
+            ],
+            "intent_features": [
+                np.array(
+                    [
+                        np.random.randint(2, size=(5, 10)),
+                        np.random.randint(2, size=(2, 10)),
+                        np.random.randint(2, size=(3, 10)),
+                        np.random.randint(2, size=(1, 10)),
+                        np.random.randint(2, size=(3, 10)),
+                    ]
+                )
+            ],
+            "intent_ids": [np.array([0, 1, 0, 1, 1])],
+            "tag_ids": [
+                np.array(
+                    [
+                        np.array([0, 1, 1, 0, 2]),
+                        np.array([2, 0]),
+                        np.array([0, 1, 1]),
+                        np.array([0, 1]),
+                        np.array([0, 0, 0]),
+                    ]
+                )
+            ],
+        }
+    )
+
+
+def test_shuffle_session_data(model_data: RasaModelData):
+    before = model_data.values()
+
+    model_data.shuffle()
+
+    assert np.array(before) != np.array(model_data.values())
+
+
+def test_split_session_data_by_label(model_data: RasaModelData):
+    split_model_data = model_data._split_by_label_ids(
+        model_data.get("intent_ids")[0], np.array([0, 1])
+    )
+
+    assert len(split_model_data) == 2
+    for s in split_model_data:
+        assert len(set(s.get("intent_ids")[0])) == 1
+
+
+def test_train_val_split(model_data: RasaModelData):
+    train_model_data, test_model_data = model_data.split(2, 42, "intent_ids")
+
+    for k, values in model_data.items():
+        assert len(values) == len(train_model_data.get(k))
+        assert len(values) == len(test_model_data.get(k))
+        for i, v in enumerate(values):
+            assert v[0].dtype == train_model_data.get(k)[i][0].dtype
+
+    for values in train_model_data.values():
+        for v in values:
+            assert v.shape[0] == 3
+
+    for values in test_model_data.values():
+        for v in values:
+            assert v.shape[0] == 2
+
+
+@pytest.mark.parametrize("size", [0, 1, 5])
+def test_train_val_split_incorrect_size(model_data: RasaModelData, size: int):
+    with pytest.raises(ValueError):
+        model_data.split(size, 42, "intent_ids")
+
+
+def test_session_data_for_ids(model_data: RasaModelData):
+    filtered_data = model_data._data_for_ids(np.array([0, 1]))
+
+    for values in filtered_data.values():
+        for v in values:
+            assert v.shape[0] == 2
+
+    k = list(model_data.keys())[0]
+
+    assert np.all(np.array(filtered_data[k][0][0]) == np.array(model_data.get(k)[0][0]))
+    assert np.all(np.array(filtered_data[k][0][1]) == np.array(model_data.get(k)[0][1]))
+
+
+def test_get_number_of_examples(model_data: RasaModelData):
+    assert model_data.get_number_of_examples() == 5
+
+
+def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):
+    model_data.set("dense", [np.random.randint(5, size=(2, 10))])
+    with pytest.raises(ValueError):
+        model_data.get_number_of_examples()
+
+
+def test_gen_batch(model_data: RasaModelData):
+    iterator = model_data._gen_batch(
+        2, "intent_ids", shuffle=True, batch_strategy="balanced"
+    )
+
+    batch = next(iterator)
+    assert len(batch) == 7
+    assert len(batch[0]) == 2
+
+    batch = next(iterator)
+    assert len(batch) == 7
+    assert len(batch[0]) == 2
+
+    batch = next(iterator)
+    assert len(batch) == 7
+    assert len(batch[0]) == 1
+
+    with pytest.raises(StopIteration):
+        next(iterator)
+
+
+def test_balance_session_data(model_data: RasaModelData):
+    model_data.balance(2, False, "intent_ids")
+
+    assert np.all(model_data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index bdbb89cde492..8bbfb63d8b2b 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -15,7 +15,7 @@
 
 
 @pytest.fixture
-async def session_data() -> SessionDataType:
+async def model_data() -> SessionDataType:
     return {
         "text_features": [
             np.array(
@@ -63,15 +63,15 @@ async def session_data() -> SessionDataType:
     }
 
 
-def test_shuffle_session_data(session_data: SessionDataType):
-    shuffeled_session_data = shuffle_session_data(session_data)
+def test_shuffle_session_data(model_data):
+    shuffeled_session_data = shuffle_session_data(model_data)
 
-    assert np.array(shuffeled_session_data.values()) != np.array(session_data.values())
+    assert np.array(shuffeled_session_data.values()) != np.array(model_data.values())
 
 
-def test_split_session_data_by_label(session_data: SessionDataType):
+def test_split_session_data_by_label(model_data):
     split_session_data = split_session_data_by_label_ids(
-        session_data, session_data["intent_ids"][0], np.array([0, 1])
+        model_data, model_data["intent_ids"][0], np.array([0, 1])
     )
 
     assert len(split_session_data) == 2
@@ -79,12 +79,12 @@ def test_split_session_data_by_label(session_data: SessionDataType):
         assert len(set(s["intent_ids"][0])) == 1
 
 
-def test_train_val_split(session_data: SessionDataType):
+def test_train_val_split(model_data):
     train_session_data, val_session_data = train_val_split(
-        session_data, 2, 42, "intent_ids"
+        model_data, 2, 42, "intent_ids"
     )
 
-    for k, values in session_data.items():
+    for k, values in model_data.items():
         assert len(values) == len(train_session_data[k])
         assert len(values) == len(val_session_data[k])
         for i, v in enumerate(values):
@@ -100,43 +100,43 @@ def test_train_val_split(session_data: SessionDataType):
 
 
 @pytest.mark.parametrize("size", [0, 1, 5])
-def test_train_val_split_incorrect_size(session_data: SessionDataType, size):
+def test_train_val_split_incorrect_size(model_data, size):
     with pytest.raises(ValueError):
-        train_val_split(session_data, size, 42, "intent_ids")
+        train_val_split(model_data, size, 42, "intent_ids")
 
 
-def test_session_data_for_ids(session_data: SessionDataType):
-    filtered_session_data = session_data_for_ids(session_data, np.array([0, 1]))
+def test_session_data_for_ids(model_data):
+    filtered_session_data = session_data_for_ids(model_data, np.array([0, 1]))
 
     for values in filtered_session_data.values():
         for v in values:
             assert v.shape[0] == 2
 
-    k = list(session_data.keys())[0]
+    k = list(model_data.keys())[0]
 
     assert np.all(
-        np.array(filtered_session_data[k][0][0]) == np.array(session_data[k][0][0])
+        np.array(filtered_session_data[k][0][0]) == np.array(model_data[k][0][0])
     )
     assert np.all(
-        np.array(filtered_session_data[k][0][1]) == np.array(session_data[k][0][1])
+        np.array(filtered_session_data[k][0][1]) == np.array(model_data[k][0][1])
     )
 
 
-def test_get_number_of_examples(session_data: SessionDataType):
-    num = get_number_of_examples(session_data)
+def test_get_number_of_examples(model_data):
+    num = get_number_of_examples(model_data)
 
     assert num == 5
 
 
-def test_get_number_of_examples_raises_value_error(session_data: SessionDataType):
-    session_data["dense"] = np.random.randint(5, size=(2, 10))
+def test_get_number_of_examples_raises_value_error(model_data):
+    model_data["dense"] = np.random.randint(5, size=(2, 10))
     with pytest.raises(ValueError):
-        get_number_of_examples(session_data)
+        get_number_of_examples(model_data)
 
 
-def test_gen_batch(session_data: SessionDataType):
+def test_gen_batch(model_data):
     iterator = gen_batch(
-        session_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced"
+        model_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced"
     )
 
     batch = next(iterator)
@@ -155,10 +155,10 @@ def test_gen_batch(session_data: SessionDataType):
         next(iterator)
 
 
-def test_balance_session_data(session_data: SessionDataType):
-    balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids")
+def test_balance_session_data(model_data):
+    balanced_session_data = balance_session_data(model_data, 2, False, "intent_ids")
 
-    for k, values in session_data.items():
+    for k, values in model_data.items():
         assert k in balanced_session_data
 
         for i, v in enumerate(values):

From 9cdffda423cb8a39e490eb2fbde85f4ec60e305a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 14:28:18 +0100
Subject: [PATCH 141/633] remove methods from train_utils

---
 rasa/utils/tf_model_data.py     | 153 ++++------
 rasa/utils/train_utils.py       | 490 +-------------------------------
 tests/utils/test_train_utils.py | 167 -----------
 3 files changed, 61 insertions(+), 749 deletions(-)
 delete mode 100644 tests/utils/test_train_utils.py

diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
index 25c85a2eb3e9..5924e1ca7eab 100644
--- a/rasa/utils/tf_model_data.py
+++ b/rasa/utils/tf_model_data.py
@@ -314,104 +314,21 @@ def _get_shapes_types(self) -> Tuple:
         types = []
         shapes = []
 
-        def append_shape(v: np.ndarray):
-            if isinstance(v[0], scipy.sparse.spmatrix):
+        def append_shape(features: np.ndarray):
+            if isinstance(features[0], scipy.sparse.spmatrix):
                 # scipy matrix is converted into indices, data, shape
-                shapes.append((None, v[0].ndim + 1))
+                shapes.append((None, features[0].ndim + 1))
                 shapes.append((None,))
-                shapes.append((v[0].ndim + 1))
-            elif v[0].ndim == 0:
+                shapes.append((features[0].ndim + 1))
+            elif features[0].ndim == 0:
                 shapes.append((None,))
-            elif v[0].ndim == 1:
-                shapes.append((None, v[0].shape[-1]))
+            elif features[0].ndim == 1:
+                shapes.append((None, features[0].shape[-1]))
             else:
-                shapes.append((None, None, v[0].shape[-1]))
+                shapes.append((None, None, features[0].shape[-1]))
 
-        def append_type(v: np.ndarray):
-            if isinstance(v[0], scipy.sparse.spmatrix):
-                # scipy matrix is converted into indices, data, shape
-                types.append(tf.int64)
-                types.append(tf.float32)
-                types.append(tf.int64)
-            else:
-                types.append(tf.float32)
-
-        for values in self.data.values():
-            for v in values:
-                append_shape(v)
-                append_type(v)
-
-        return tuple(shapes), tuple(types)
-
-    def _scipy_matrix_to_values(self, array_of_sparse: np.ndarray) -> List[np.ndarray]:
-        """Convert a scipy matrix into inidces, data, and shape."""
-
-        if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
-            array_of_sparse = [x.tocoo() for x in array_of_sparse]
-
-        max_seq_len = max([x.shape[0] for x in array_of_sparse])
-
-        indices = np.hstack(
-            [
-                np.vstack([i * np.ones_like(x.row), x.row, x.col])
-                for i, x in enumerate(array_of_sparse)
-            ]
-        ).T
-        data = np.hstack([x.data for x in array_of_sparse])
-
-        shape = np.array(
-            (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])
-        )
-
-        return [
-            indices.astype(np.int64),
-            data.astype(np.float32),
-            shape.astype(np.int64),
-        ]
-
-    def _pad_dense_data(self, array_of_dense: np.ndarray) -> np.ndarray:
-        """Pad data of different lengths.
-
-        Sequential data is padded with zeros. Zeros are added to the end of data.
-        """
-
-        if array_of_dense[0].ndim < 2:
-            # data doesn't contain a sequence
-            return array_of_dense
-
-        data_size = len(array_of_dense)
-        max_seq_len = max([x.shape[0] for x in array_of_dense])
-
-        data_padded = np.zeros(
-            [data_size, max_seq_len, array_of_dense[0].shape[-1]],
-            dtype=array_of_dense[0].dtype,
-        )
-        for i in range(data_size):
-            data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i]
-
-        return data_padded.astype(np.float32)
-
-    def _get_shapes_types(self) -> Tuple:
-        """Extract shapes and types from session data."""
-
-        types = []
-        shapes = []
-
-        def append_shape(v: np.ndarray):
-            if isinstance(v[0], scipy.sparse.spmatrix):
-                # scipy matrix is converted into indices, data, shape
-                shapes.append((None, v[0].ndim + 1))
-                shapes.append((None,))
-                shapes.append((v[0].ndim + 1))
-            elif v[0].ndim == 0:
-                shapes.append((None,))
-            elif v[0].ndim == 1:
-                shapes.append((None, v[0].shape[-1]))
-            else:
-                shapes.append((None, None, v[0].shape[-1]))
-
-        def append_type(v: np.ndarray):
-            if isinstance(v[0], scipy.sparse.spmatrix):
+        def append_type(features: np.ndarray):
+            if isinstance(features[0], scipy.sparse.spmatrix):
                 # scipy matrix is converted into indices, data, shape
                 types.append(tf.int64)
                 types.append(tf.float32)
@@ -563,3 +480,53 @@ def _create_label_ids(label_ids: np.ndarray) -> np.ndarray:
             return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]])
 
         raise ValueError("Unsupported label_ids dimensions")
+
+    @staticmethod
+    def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
+        """Pad data of different lengths.
+
+        Sequential data is padded with zeros. Zeros are added to the end of data.
+        """
+
+        if array_of_dense[0].ndim < 2:
+            # data doesn't contain a sequence
+            return array_of_dense
+
+        data_size = len(array_of_dense)
+        max_seq_len = max([x.shape[0] for x in array_of_dense])
+
+        data_padded = np.zeros(
+            [data_size, max_seq_len, array_of_dense[0].shape[-1]],
+            dtype=array_of_dense[0].dtype,
+        )
+        for i in range(data_size):
+            data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i]
+
+        return data_padded.astype(np.float32)
+
+    @staticmethod
+    def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
+        """Convert a scipy matrix into inidces, data, and shape."""
+
+        if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
+            array_of_sparse = [x.tocoo() for x in array_of_sparse]
+
+        max_seq_len = max([x.shape[0] for x in array_of_sparse])
+
+        indices = np.hstack(
+            [
+                np.vstack([i * np.ones_like(x.row), x.row, x.col])
+                for i, x in enumerate(array_of_sparse)
+            ]
+        ).T
+        data = np.hstack([x.data for x in array_of_sparse])
+
+        shape = np.array(
+            (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])
+        )
+
+        return [
+            indices.astype(np.int64),
+            data.astype(np.float32),
+            shape.astype(np.int64),
+        ]
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index e10a9d44fc61..6c32e5fdd459 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,10 +1,7 @@
 from collections import defaultdict
 import logging
-import scipy.sparse
-import typing
-from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Any, NamedTuple
+from typing import List, Optional, Text, Dict, Tuple, Union, Any, NamedTuple
 import numpy as np
-from sklearn.model_selection import train_test_split
 import tensorflow as tf
 
 from rasa.utils.tf_model_data import DataSignature
@@ -12,14 +9,6 @@
 logger = logging.getLogger(__name__)
 
 
-# type for all tf session related data
-SessionDataType = Dict[Text, List[np.ndarray]]
-# signature for all session related data
-# (boolean indicates whether data are sparse or not)
-# (list values represent the shape)
-SessionDataSignature = Dict[Text, List[Tuple[bool, List[int]]]]
-
-
 # namedtuple for training metrics
 class TrainingMetrics(NamedTuple):
     loss: Dict[Text, Union[tf.Tensor, float]]
@@ -35,371 +24,6 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto
         return None
 
 
-def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray":
-    """Convert various size label_ids into single dim array.
-
-    for multi-label y, map each distinct row to a string repr
-    using join because str(row) uses an ellipsis if len(row) > 1000.
-    Idea taken from sklearn's stratify split.
-    """
-
-    if label_ids.ndim == 1:
-        return label_ids
-
-    if label_ids.ndim == 2 and label_ids.shape[-1] == 1:
-        return label_ids[:, 0]
-
-    if label_ids.ndim == 2:
-        return np.array([" ".join(row.astype("str")) for row in label_ids])
-
-    if label_ids.ndim == 3 and label_ids.shape[-1] == 1:
-        return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]])
-
-    raise ValueError("Unsupported label_ids dimensions")
-
-
-# noinspection PyPep8Naming
-def train_val_split(
-    session_data: SessionDataType,
-    evaluate_on_num_examples: int,
-    random_seed: int,
-    label_key: Text,
-) -> Tuple[SessionDataType, SessionDataType]:
-    """Create random hold out validation set using stratified split."""
-
-    if label_key not in session_data or len(session_data[label_key]) > 1:
-        raise ValueError(f"Key '{label_key}' not in SessionData.")
-
-    label_ids = create_label_ids(session_data[label_key][0])
-
-    label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
-
-    check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data)
-
-    counts = np.array([label_counts[label] for label in label_ids])
-
-    multi_values = [v[counts > 1] for values in session_data.values() for v in values]
-
-    solo_values = [v[counts == 1] for values in session_data.values() for v in values]
-
-    output_values = train_test_split(
-        *multi_values,
-        test_size=evaluate_on_num_examples,
-        random_state=random_seed,
-        stratify=label_ids[counts > 1],
-    )
-
-    session_data_train, session_data_val = convert_train_test_split(
-        output_values, session_data, solo_values
-    )
-
-    return session_data_train, session_data_val
-
-
-def check_train_test_sizes(
-    evaluate_on_num_examples: int,
-    label_counts: Dict[Any, int],
-    session_data: SessionDataType,
-):
-    """Check whether the evaluation data set is too large or too small."""
-
-    num_examples = get_number_of_examples(session_data)
-
-    if evaluate_on_num_examples >= num_examples - len(label_counts):
-        raise ValueError(
-            f"Validation set of {evaluate_on_num_examples} is too large. Remaining "
-            f"train set should be at least equal to number of classes "
-            f"{len(label_counts)}."
-        )
-    elif evaluate_on_num_examples < len(label_counts):
-        raise ValueError(
-            f"Validation set of {evaluate_on_num_examples} is too small. It should be "
-            "at least equal to number of classes {label_counts}."
-        )
-
-
-def convert_train_test_split(
-    output_values: List[Any], session_data: SessionDataType, solo_values: List[Any]
-) -> Tuple[SessionDataType, SessionDataType]:
-    """Convert the output of sklearn.model_selection.train_test_split into train and
-    eval session data."""
-
-    session_data_train = defaultdict(list)
-    session_data_val = defaultdict(list)
-
-    # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc.
-    # order is kept, e.g. same order as session data keys
-
-    # train datasets have an even index
-    index = 0
-    for key, values in session_data.items():
-        for _ in range(len(values)):
-            session_data_train[key].append(
-                combine_features(output_values[index * 2], solo_values[index])
-            )
-            index += 1
-
-    # val datasets have an odd index
-    index = 0
-    for key, values in session_data.items():
-        for _ in range(len(values)):
-            session_data_val[key].append(output_values[(index * 2) + 1])
-            index += 1
-
-    return session_data_train, session_data_val
-
-
-def combine_features(
-    feature_1: Union[np.ndarray, scipy.sparse.spmatrix],
-    feature_2: Union[np.ndarray, scipy.sparse.spmatrix],
-) -> Union[np.ndarray, scipy.sparse.spmatrix]:
-    """Concatenate features."""
-
-    if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance(
-        feature_2, scipy.sparse.spmatrix
-    ):
-        if feature_2.shape[0] == 0:
-            return feature_1
-        if feature_1.shape[0] == 0:
-            return feature_2
-        return scipy.sparse.vstack([feature_1, feature_2])
-
-    return np.concatenate([feature_1, feature_2])
-
-
-def shuffle_session_data(session_data: SessionDataType) -> SessionDataType:
-    """Shuffle session data."""
-
-    data_points = get_number_of_examples(session_data)
-    ids = np.random.permutation(data_points)
-    return session_data_for_ids(session_data, ids)
-
-
-def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray):
-    """Filter session data by ids."""
-
-    new_session_data = defaultdict(list)
-    for k, values in session_data.items():
-        for v in values:
-            new_session_data[k].append(v[ids])
-    return new_session_data
-
-
-def split_session_data_by_label_ids(
-    session_data: SessionDataType,
-    label_ids: "np.ndarray",
-    unique_label_ids: "np.ndarray",
-) -> List[SessionDataType]:
-    """Reorganize session data into a list of session data with the same labels."""
-
-    label_data = []
-    for label_id in unique_label_ids:
-        ids = label_ids == label_id
-        label_data.append(session_data_for_ids(session_data, ids))
-    return label_data
-
-
-# noinspection PyPep8Naming
-def balance_session_data(
-    session_data: SessionDataType, batch_size: int, shuffle: bool, label_key: Text
-) -> SessionDataType:
-    """Mix session data to account for class imbalance.
-
-    This batching strategy puts rare classes approximately in every other batch,
-    by repeating them. Mimics stratified batching, but also takes into account
-    that more populated classes should appear more often.
-    """
-
-    if label_key not in session_data or len(session_data[label_key]) > 1:
-        raise ValueError(f"Key '{label_key}' not in SessionDataType.")
-
-    label_ids = create_label_ids(session_data[label_key][0])
-
-    unique_label_ids, counts_label_ids = np.unique(
-        label_ids, return_counts=True, axis=0
-    )
-    num_label_ids = len(unique_label_ids)
-
-    # need to call every time, so that the data is shuffled inside each class
-    label_data = split_session_data_by_label_ids(
-        session_data, label_ids, unique_label_ids
-    )
-
-    data_idx = [0] * num_label_ids
-    num_data_cycles = [0] * num_label_ids
-    skipped = [False] * num_label_ids
-
-    new_session_data = defaultdict(list)
-    num_examples = get_number_of_examples(session_data)
-
-    while min(num_data_cycles) == 0:
-        if shuffle:
-            indices_of_labels = np.random.permutation(num_label_ids)
-        else:
-            indices_of_labels = range(num_label_ids)
-
-        for index in indices_of_labels:
-            if num_data_cycles[index] > 0 and not skipped[index]:
-                skipped[index] = True
-                continue
-            else:
-                skipped[index] = False
-
-            index_batch_size = (
-                int(counts_label_ids[index] / num_examples * batch_size) + 1
-            )
-
-            for k, values in label_data[index].items():
-                for i, v in enumerate(values):
-                    if len(new_session_data[k]) < i + 1:
-                        new_session_data[k].append([])
-                    new_session_data[k][i].append(
-                        v[data_idx[index] : data_idx[index] + index_batch_size]
-                    )
-
-            data_idx[index] += index_batch_size
-            if data_idx[index] >= counts_label_ids[index]:
-                num_data_cycles[index] += 1
-                data_idx[index] = 0
-
-            if min(num_data_cycles) > 0:
-                break
-
-    final_session_data = defaultdict(list)
-    for k, values in new_session_data.items():
-        for v in values:
-            final_session_data[k].append(np.concatenate(np.array(v)))
-
-    return final_session_data
-
-
-def get_number_of_examples(session_data: SessionDataType) -> int:
-    """Obtain number of examples in session data.
-
-    Raise a ValueError if number of examples differ for different data in session data.
-    """
-
-    example_lengths = [v.shape[0] for values in session_data.values() for v in values]
-
-    # check if number of examples is the same for all values
-    if not all(length == example_lengths[0] for length in example_lengths):
-        raise ValueError(
-            f"Number of examples differs for keys '{session_data.keys()}'. Number of "
-            f"examples should be the same for all data in session data."
-        )
-
-    return example_lengths[0]
-
-
-def gen_batch(
-    session_data: SessionDataType,
-    batch_size: int,
-    label_key: Text,
-    batch_strategy: Text = "sequence",
-    shuffle: bool = False,
-) -> Generator[Tuple, None, None]:
-    """Generate batches."""
-
-    if shuffle:
-        session_data = shuffle_session_data(session_data)
-
-    if batch_strategy == "balanced":
-        session_data = balance_session_data(
-            session_data, batch_size, shuffle, label_key
-        )
-
-    num_examples = get_number_of_examples(session_data)
-    num_batches = num_examples // batch_size + int(num_examples % batch_size > 0)
-
-    for batch_num in range(num_batches):
-        start = batch_num * batch_size
-        end = start + batch_size
-
-        yield prepare_batch(session_data, start, end)
-
-
-def prepare_batch(
-    session_data: SessionDataType,
-    start: Optional[int] = None,
-    end: Optional[int] = None,
-    tuple_sizes: Optional[Dict[Text, int]] = None,
-) -> Tuple[Optional[np.ndarray]]:
-    """Slices session data into batch using given start and end value."""
-
-    batch_data = []
-
-    for key, values in session_data.items():
-        # add None for not present values during processing
-        if not values:
-            if tuple_sizes:
-                batch_data += [None] * tuple_sizes[key]
-            else:
-                batch_data.append(None)
-            continue
-
-        for v in values:
-            if start is not None and end is not None:
-                _data = v[start:end]
-            elif start is not None:
-                _data = v[start:]
-            elif end is not None:
-                _data = v[:end]
-            else:
-                _data = v[:]
-
-            if isinstance(_data[0], scipy.sparse.spmatrix):
-                batch_data.extend(scipy_matrix_to_values(_data))
-            else:
-                batch_data.append(pad_dense_data(_data))
-
-    # len of batch_data is equal to the number of keys in session data
-    return tuple(batch_data)
-
-
-def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
-    """Convert a scipy matrix into inidces, data, and shape."""
-
-    if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
-        array_of_sparse = [x.tocoo() for x in array_of_sparse]
-
-    max_seq_len = max([x.shape[0] for x in array_of_sparse])
-
-    indices = np.hstack(
-        [
-            np.vstack([i * np.ones_like(x.row), x.row, x.col])
-            for i, x in enumerate(array_of_sparse)
-        ]
-    ).T
-    data = np.hstack([x.data for x in array_of_sparse])
-
-    shape = np.array((len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1]))
-
-    return [indices.astype(np.int64), data.astype(np.float32), shape.astype(np.int64)]
-
-
-def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
-    """Pad data of different lengths.
-
-    Sequential data is padded with zeros. Zeros are added to the end of data.
-    """
-
-    if array_of_dense[0].ndim < 2:
-        # data doesn't contain a sequence
-        return array_of_dense
-
-    data_size = len(array_of_dense)
-    max_seq_len = max([x.shape[0] for x in array_of_dense])
-
-    data_padded = np.zeros(
-        [data_size, max_seq_len, array_of_dense[0].shape[-1]],
-        dtype=array_of_dense[0].dtype,
-    )
-    for i in range(data_size):
-        data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i]
-
-    return data_padded.astype(np.float32)
-
-
 def batch_to_session_data(
     batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
     data_signature: Dict[Text, List[DataSignature]],
@@ -434,82 +58,6 @@ def batch_to_session_data(
     return batch_data
 
 
-def batch_tuple_sizes(session_data: SessionDataType) -> Dict[Text, int]:
-
-    # save the amount of placeholders attributed to session data keys
-    tuple_sizes = defaultdict(int)
-
-    idx = 0
-    for k, values in session_data.items():
-        tuple_sizes[k] = 0
-        for v in values:
-            if isinstance(v[0], scipy.sparse.spmatrix):
-                tuple_sizes[k] += 3
-                idx += 3
-            else:
-                tuple_sizes[k] += 1
-                idx += 1
-
-    return tuple_sizes
-
-
-def create_tf_dataset(
-    session_data: SessionDataType,
-    batch_size: Union["tf.Tensor", int],
-    label_key: Text,
-    batch_strategy: Text = "sequence",
-    shuffle: bool = False,
-) -> "tf.data.Dataset":
-    """Create tf dataset."""
-
-    shapes, types = get_shapes_types(session_data)
-
-    return tf.data.Dataset.from_generator(
-        lambda batch_size_: gen_batch(
-            session_data, batch_size_, label_key, batch_strategy, shuffle
-        ),
-        output_types=types,
-        output_shapes=shapes,
-        args=([batch_size]),
-    )
-
-
-def get_shapes_types(session_data: SessionDataType) -> Tuple:
-    """Extract shapes and types from session data."""
-
-    types = []
-    shapes = []
-
-    def append_shape(v: np.ndarray):
-        if isinstance(v[0], scipy.sparse.spmatrix):
-            # scipy matrix is converted into indices, data, shape
-            shapes.append((None, v[0].ndim + 1))
-            shapes.append((None,))
-            shapes.append((v[0].ndim + 1))
-        elif v[0].ndim == 0:
-            shapes.append((None,))
-        elif v[0].ndim == 1:
-            shapes.append((None, v[0].shape[-1]))
-        else:
-            shapes.append((None, None, v[0].shape[-1]))
-
-    def append_type(v: np.ndarray):
-        if isinstance(v[0], scipy.sparse.spmatrix):
-            # scipy matrix is converted into indices, data, shape
-            types.append(tf.int64)
-            types.append(tf.float32)
-            types.append(tf.int64)
-        else:
-            types.append(tf.float32)
-
-    for values in session_data.values():
-        for v in values:
-            append_shape(v)
-            append_type(v)
-
-    return tuple(shapes), tuple(types)
-
-
 def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
     if similarity_type == "cosine":
         # clip negative values to zero
@@ -538,42 +86,6 @@ def linearly_increasing_batch_size(
         return int(batch_size[0])
 
 
-def output_validation_stat(
-    eval_init_op: "tf.Operation",
-    metrics: TrainingMetrics,
-    session: "tf.Session",
-    is_training: "tf.Session",
-    batch_size_in: "tf.Tensor",
-    ep_batch_size: int,
-) -> TrainingMetrics:
-    """Output training statistics"""
-
-    session.run(eval_init_op, feed_dict={batch_size_in: ep_batch_size})
-    ep_val_metrics = TrainingMetrics(
-        loss=defaultdict(lambda: 0.0), score=defaultdict(lambda: 0.0)
-    )
-    batches_per_epoch = 0
-    while True:
-        try:
-            batch_val_metrics = session.run([metrics], feed_dict={is_training: False})
-            batch_val_metrics = batch_val_metrics[0]
-            batches_per_epoch += 1
-            for name, value in batch_val_metrics.loss.items():
-                ep_val_metrics.loss[name] += value
-            for name, value in batch_val_metrics.score.items():
-                ep_val_metrics.score[name] += value
-
-        except tf.errors.OutOfRangeError:
-            break
-
-    for name, value in ep_val_metrics.loss.items():
-        ep_val_metrics.loss[name] = value / batches_per_epoch
-    for name, value in ep_val_metrics.score.items():
-        ep_val_metrics.score[name] = value / batches_per_epoch
-
-    return ep_val_metrics
-
-
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
deleted file mode 100644
index 8bbfb63d8b2b..000000000000
--- a/tests/utils/test_train_utils.py
+++ /dev/null
@@ -1,167 +0,0 @@
-import pytest
-import scipy.sparse
-import numpy as np
-
-from rasa.utils.train_utils import (
-    SessionDataType,
-    shuffle_session_data,
-    split_session_data_by_label_ids,
-    train_val_split,
-    session_data_for_ids,
-    get_number_of_examples,
-    gen_batch,
-    balance_session_data,
-)
-
-
-@pytest.fixture
-async def model_data() -> SessionDataType:
-    return {
-        "text_features": [
-            np.array(
-                [
-                    np.random.rand(5, 14),
-                    np.random.rand(2, 14),
-                    np.random.rand(3, 14),
-                    np.random.rand(1, 14),
-                    np.random.rand(3, 14),
-                ]
-            ),
-            np.array(
-                [
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                ]
-            ),
-        ],
-        "intent_features": [
-            np.array(
-                [
-                    np.random.randint(2, size=(5, 10)),
-                    np.random.randint(2, size=(2, 10)),
-                    np.random.randint(2, size=(3, 10)),
-                    np.random.randint(2, size=(1, 10)),
-                    np.random.randint(2, size=(3, 10)),
-                ]
-            )
-        ],
-        "intent_ids": [np.array([0, 1, 0, 1, 1])],
-        "tag_ids": [
-            np.array(
-                [
-                    np.array([0, 1, 1, 0, 2]),
-                    np.array([2, 0]),
-                    np.array([0, 1, 1]),
-                    np.array([0, 1]),
-                    np.array([0, 0, 0]),
-                ]
-            )
-        ],
-    }
-
-
-def test_shuffle_session_data(model_data):
-    shuffeled_session_data = shuffle_session_data(model_data)
-
-    assert np.array(shuffeled_session_data.values()) != np.array(model_data.values())
-
-
-def test_split_session_data_by_label(model_data):
-    split_session_data = split_session_data_by_label_ids(
-        model_data, model_data["intent_ids"][0], np.array([0, 1])
-    )
-
-    assert len(split_session_data) == 2
-    for s in split_session_data:
-        assert len(set(s["intent_ids"][0])) == 1
-
-
-def test_train_val_split(model_data):
-    train_session_data, val_session_data = train_val_split(
-        model_data, 2, 42, "intent_ids"
-    )
-
-    for k, values in model_data.items():
-        assert len(values) == len(train_session_data[k])
-        assert len(values) == len(val_session_data[k])
-        for i, v in enumerate(values):
-            assert v[0].dtype == train_session_data[k][i][0].dtype
-
-    for values in train_session_data.values():
-        for v in values:
-            assert v.shape[0] == 3
-
-    for values in val_session_data.values():
-        for v in values:
-            assert v.shape[0] == 2
-
-
-@pytest.mark.parametrize("size", [0, 1, 5])
-def test_train_val_split_incorrect_size(model_data, size):
-    with pytest.raises(ValueError):
-        train_val_split(model_data, size, 42, "intent_ids")
-
-
-def test_session_data_for_ids(model_data):
-    filtered_session_data = session_data_for_ids(model_data, np.array([0, 1]))
-
-    for values in filtered_session_data.values():
-        for v in values:
-            assert v.shape[0] == 2
-
-    k = list(model_data.keys())[0]
-
-    assert np.all(
-        np.array(filtered_session_data[k][0][0]) == np.array(model_data[k][0][0])
-    )
-    assert np.all(
-        np.array(filtered_session_data[k][0][1]) == np.array(model_data[k][0][1])
-    )
-
-
-def test_get_number_of_examples(model_data):
-    num = get_number_of_examples(model_data)
-
-    assert num == 5
-
-
-def test_get_number_of_examples_raises_value_error(model_data):
-    model_data["dense"] = np.random.randint(5, size=(2, 10))
-    with pytest.raises(ValueError):
-        get_number_of_examples(model_data)
-
-
-def test_gen_batch(model_data):
-    iterator = gen_batch(
-        model_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced"
-    )
-
-    batch = next(iterator)
-    assert len(batch) == 7
-    assert len(batch[0]) == 2
-
-    batch = next(iterator)
-    assert len(batch) == 7
-    assert len(batch[0]) == 2
-
-    batch = next(iterator)
-    assert len(batch) == 7
-    assert len(batch[0]) == 1
-
-    with pytest.raises(StopIteration):
-        next(iterator)
-
-
-def test_balance_session_data(model_data):
-    balanced_session_data = balance_session_data(model_data, 2, False, "intent_ids")
-
-    for k, values in model_data.items():
-        assert k in balanced_session_data
-
-        for i, v in enumerate(values):
-            assert len(v) == len(balanced_session_data[k][i])
-
-    assert np.all(balanced_session_data["intent_ids"][0] == np.array([0, 1, 1, 0, 1]))

From 5272532107ddac5132fcee1234a95aaf9f51ef1b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 20 Jan 2020 14:55:31 +0100
Subject: [PATCH 142/633] minor docs fix

---
 docs/nlu/choosing-a-pipeline.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 2d3f60c4bb84..ff3d374d43b1 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -66,7 +66,7 @@ examples are already very similar, the intent classified for both is highly like
 large enough training data.
 
     .. note::
-        To use ``pretrained_embeddings_convert`` pipeline, you should install ``tensorflow-text==2.1.0rc0`` and ``tensorflow-hub==0.7.0``. Otherwise, you can also pip install Rasa with ``pip install rasa[convert]``. Please also note that tensorflow-text is only currently supported on Linux platforms.
+        To use ``pretrained_embeddings_convert`` pipeline, you should install Rasa with ``pip install rasa[convert]``. Please also note that one of the dependencies(``tensorflow-text``) is currently only supported on Linux platforms.
 
 supervised_embeddings
 ~~~~~~~~~~~~~~~~~~~~~

From 16e98d76d342ba995b12ce5cfd1da2c08a14552c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 14:56:15 +0100
Subject: [PATCH 143/633] clean up

---
 .../embedding_intent_classifier.py            | 20 ++++++++++---------
 rasa/utils/tf_model_data.py                   |  3 +++
 rasa/utils/tf_models.py                       |  2 +-
 rasa/utils/train_utils.py                     |  2 +-
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index dcfe9948c0f3..41c451c78414 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -486,10 +486,7 @@ def _create_model_data(
         model_data = RasaModelData()
         model_data.add_features("text_features", [X_sparse, X_dense])
         model_data.add_features("label_features", [Y_sparse, Y_dense])
-        if label_attribute and (
-            "label_features" not in model_data.keys()
-            or not model_data.get("label_features")
-        ):
+        if label_attribute and model_data.feature_not_exists("label_features"):
             # no label features are present, get default features from _label_data
             model_data.set(
                 "label_features", self._use_default_label_features(label_ids)
@@ -592,6 +589,7 @@ def train(
             self.component_config[EVAL_NUM_EPOCHS],
             label_key="label_ids",
             batch_strategy=self.component_config[BATCH_STRATEGY],
+            random_seed=self.component_config[RANDOM_SEED],
         )
 
     # process helpers
@@ -909,7 +907,7 @@ def __init__(
         # data
         self.data_signature = data_signature
         label_batch = label_data.prepare_batch()
-        self.tf_label_data = train_utils.batch_to_session_data(
+        self.tf_label_data = train_utils.batch_to_model_data_format(
             label_batch, label_data.get_signature()
         )
         self._num_tags = len(inverted_tag_dict)
@@ -1191,7 +1189,9 @@ def _entity_loss(
     def _train_losses_scores(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
-        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.data_signature)
+        tf_batch_data = train_utils.batch_to_model_data_format(
+            batch_in, self.data_signature
+        )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1264,8 +1264,8 @@ def eval(self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]):
         for k, v in scores.items():
             self.eval_metrics[f"val_{k}"].update_state(v)
 
-    def build_for_predict(self, session_data: RasaModelData) -> None:
-        self.batch_tuple_sizes = session_data.batch_tuple_sizes()
+    def build_for_predict(self, model_data: RasaModelData) -> None:
+        self.batch_tuple_sizes = model_data.batch_tuple_sizes()
 
         all_labels_embed, _ = self._build_all_b()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
@@ -1273,7 +1273,9 @@ def build_for_predict(self, session_data: RasaModelData) -> None:
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = train_utils.batch_to_session_data(batch_in, self.data_signature)
+        tf_batch_data = train_utils.batch_to_model_data_format(
+            batch_in, self.data_signature
+        )
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
index 5924e1ca7eab..8dad88cbcae4 100644
--- a/rasa/utils/tf_model_data.py
+++ b/rasa/utils/tf_model_data.py
@@ -34,6 +34,9 @@ def values(self):
     def keys(self):
         return self.data.keys()
 
+    def feature_not_exists(self, key: Text) -> bool:
+        return key not in self.data or not self.data[key]
+
     def split(
         self, number_of_test_examples: int, random_seed: int, label_key: Text
     ) -> Tuple["RasaModelData", "RasaModelData"]:
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 3b58e7d682f4..11368e3732a7 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -45,7 +45,7 @@ def fit(
             )
 
             model_data, evaluation_model_data = model_data.split(
-                evaluate_on_num_examples, random_seed, label_key="label_ids"
+                evaluate_on_num_examples, random_seed, label_key=label_key
             )
 
         disable = silent or is_logging_disabled()
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 6c32e5fdd459..3de8b325e726 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -24,7 +24,7 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto
         return None
 
 
-def batch_to_session_data(
+def batch_to_model_data_format(
     batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
     data_signature: Dict[Text, List[DataSignature]],
 ) -> Dict[Text, List[tf.Tensor]]:

From 1e3c131f1b4f17111dee7e5c6ee13c71b7b1559e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 15:00:57 +0100
Subject: [PATCH 144/633] remove not needed import

---
 rasa/utils/tf_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 11368e3732a7..877a0007dcdd 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -1,6 +1,6 @@
 import numpy as np
 import logging
-from typing import List, Optional, Text, Dict, Tuple, Union
+from typing import List, Text, Dict, Tuple, Union
 from tqdm import tqdm
 from rasa.utils import train_utils
 from rasa.utils.common import is_logging_disabled

From 8b93e36035cc6e715fb0ee8c293828459212ebc8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 20 Jan 2020 17:25:52 +0100
Subject: [PATCH 145/633] Review comments

---
 .../lexical_syntactic_featurizer.py           | 88 ++++++++++---------
 1 file changed, 45 insertions(+), 43 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index eecbb3c40726..9a750dd09240 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -76,6 +76,7 @@ def __init__(
         super().__init__(component_config)
 
         self.feature_to_idx_dict = feature_to_idx_dict or {}
+        self.number_of_features = self._calculate_number_of_features()
 
     def train(
         self,
@@ -84,63 +85,57 @@ def train(
         **kwargs: Any,
     ) -> None:
         self.feature_to_idx_dict = self._create_feature_to_idx_dict(training_data)
+        self.number_of_features = self._calculate_number_of_features()
 
         for example in training_data.training_examples:
-            self._create_text_features(example)
+            self._create_sparse_features(example)
 
     def process(self, message: Message, **kwargs: Any) -> None:
-        self._create_text_features(message)
+        self._create_sparse_features(message)
 
-    def _create_text_features(self, message: Message) -> None:
+    def _create_sparse_features(self, message: Message) -> None:
         """Convert incoming messages into sparse features using the configured
         features."""
 
         # [:-1] to remove CLS token
         tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
 
-        features = self._tokens_to_features(tokens)
-        features = self._features_to_one_hot(features)
-        features = self._combine_with_existing_sparse_features(
-            message, features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        sentence_features = self._tokens_to_features(tokens)
+        one_hot_feature_vector = self._features_to_one_hot(sentence_features)
+
+        sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)
+
+        sparse_features = self._combine_with_existing_sparse_features(
+            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
         )
-        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], features)
+        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], sparse_features)
 
     def _features_to_one_hot(
-        self, features: List[Dict[Text, Any]]
-    ) -> scipy.sparse.spmatrix:
+        self, sentence_features: List[Dict[Text, Any]]
+    ) -> np.ndarray:
         """Convert the word features into a one-hot presentation using the indices
         in the feature-to-idx dictionary."""
 
-        vec = self._initialize_feature_vector(len(features))
+        # +1 for CLS token
+        one_hot_feature_vector = np.zeros(
+            [len(sentence_features) + 1, self.number_of_features]
+        )
 
-        for word_idx, features in enumerate(features):
-            for feature_key, feature_value in features.items():
+        for token_idx, toke_features in enumerate(sentence_features):
+            for feature_name, feature_value in toke_features.items():
                 if (
-                    feature_key in self.feature_to_idx_dict
-                    and str(feature_value) in self.feature_to_idx_dict[feature_key]
+                    feature_name in self.feature_to_idx_dict
+                    and str(feature_value) in self.feature_to_idx_dict[feature_name]
                 ):
-                    feature_idx = self.feature_to_idx_dict[feature_key][
+                    feature_idx = self.feature_to_idx_dict[feature_name][
                         str(feature_value)
                     ]
-                    vec[word_idx][feature_idx] = 1
+                    one_hot_feature_vector[token_idx][feature_idx] = 1
 
         # set vector of CLS token to sum of everything
-        vec[-1] = np.sum(vec, axis=0)
-
-        return scipy.sparse.coo_matrix(vec)
-
-    def _initialize_feature_vector(self, number_of_tokens: int) -> np.ndarray:
-        """Initialize a feature vector of size number-of-tokens x number-of-features
-        with zeros."""
+        one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0)
 
-        number_of_features = sum(
-            [
-                len(feature_values.values())
-                for feature_values in self.feature_to_idx_dict.values()
-            ]
-        )
-        # +1 for the CLS token
-        return np.zeros([number_of_tokens + 1, number_of_features])
+        return one_hot_feature_vector
 
     def _create_feature_to_idx_dict(
         self, training_data: TrainingData
@@ -153,14 +148,14 @@ def _create_feature_to_idx_dict(
         """
 
         # get all possible feature values
-        features = []
+        all_features = []
         for example in training_data.training_examples:
             # [:-1] to remove CLS token
             tokens = example.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
-            features.append(self._tokens_to_features(tokens))
+            all_features.append(self._tokens_to_features(tokens))
 
         # build vocabulary of features
-        feature_vocabulary = self._build_feature_vocabulary(features)
+        feature_vocabulary = self._build_feature_vocabulary(all_features)
 
         # assign a unique index to each feature value
         return self._map_features_to_indices(feature_vocabulary)
@@ -181,8 +176,6 @@ def _map_features_to_indices(
             }
             offset += len(feature_values)
 
-        print(feature_to_idx_dict)
-
         return feature_to_idx_dict
 
     @staticmethod
@@ -191,9 +184,9 @@ def _build_feature_vocabulary(
     ) -> Dict[Text, List[Text]]:
         feature_vocabulary = defaultdict(set)
 
-        for sent_features in features:
-            for word_features in sent_features:
-                for feature_name, feature_value in word_features.items():
+        for sentence_features in features:
+            for token_features in sentence_features:
+                for feature_name, feature_value in token_features.items():
                     feature_vocabulary[feature_name].add(feature_value)
 
         # sort items to ensure same order every time (for tests)
@@ -205,7 +198,7 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
         """Convert words into discrete features."""
 
         configured_features = self.component_config["features"]
-        features = []
+        sentence_features = []
 
         for token_idx in range(len(tokens)):
             # get the window size (e.g. before, word, after) of the configured features
@@ -237,9 +230,9 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
                         feature, token, token_idx, pointer_position, len(tokens)
                     )
 
-            features.append(token_features)
+            sentence_features.append(token_features)
 
-        return features
+        return sentence_features
 
     def _get_feature_value(
         self,
@@ -263,6 +256,14 @@ def _get_feature_value(
             )
         return value
 
+    def _calculate_number_of_features(self) -> int:
+        return sum(
+            [
+                len(feature_values.values())
+                for feature_values in self.feature_to_idx_dict.values()
+            ]
+        )
+
     @classmethod
     def load(
         cls,
@@ -285,6 +286,7 @@ def load(
     def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
         """Persist this model into the passed directory.
         Return the metadata necessary to load the model again."""
+
         with open(
             os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "wb"
         ) as f:

From effa2e5f9e5cad246b05bcc273570b0ec35412ce Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 08:21:22 +0100
Subject: [PATCH 146/633] update order of methods

---
 .../lexical_syntactic_featurizer.py           | 104 +++++++++---------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 9a750dd09240..011a1785046d 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -78,6 +78,14 @@ def __init__(
         self.feature_to_idx_dict = feature_to_idx_dict or {}
         self.number_of_features = self._calculate_number_of_features()
 
+    def _calculate_number_of_features(self) -> int:
+        return sum(
+            [
+                len(feature_values.values())
+                for feature_values in self.feature_to_idx_dict.values()
+            ]
+        )
+
     def train(
         self,
         training_data: TrainingData,
@@ -93,50 +101,6 @@ def train(
     def process(self, message: Message, **kwargs: Any) -> None:
         self._create_sparse_features(message)
 
-    def _create_sparse_features(self, message: Message) -> None:
-        """Convert incoming messages into sparse features using the configured
-        features."""
-
-        # [:-1] to remove CLS token
-        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
-
-        sentence_features = self._tokens_to_features(tokens)
-        one_hot_feature_vector = self._features_to_one_hot(sentence_features)
-
-        sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)
-
-        sparse_features = self._combine_with_existing_sparse_features(
-            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        )
-        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], sparse_features)
-
-    def _features_to_one_hot(
-        self, sentence_features: List[Dict[Text, Any]]
-    ) -> np.ndarray:
-        """Convert the word features into a one-hot presentation using the indices
-        in the feature-to-idx dictionary."""
-
-        # +1 for CLS token
-        one_hot_feature_vector = np.zeros(
-            [len(sentence_features) + 1, self.number_of_features]
-        )
-
-        for token_idx, toke_features in enumerate(sentence_features):
-            for feature_name, feature_value in toke_features.items():
-                if (
-                    feature_name in self.feature_to_idx_dict
-                    and str(feature_value) in self.feature_to_idx_dict[feature_name]
-                ):
-                    feature_idx = self.feature_to_idx_dict[feature_name][
-                        str(feature_value)
-                    ]
-                    one_hot_feature_vector[token_idx][feature_idx] = 1
-
-        # set vector of CLS token to sum of everything
-        one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0)
-
-        return one_hot_feature_vector
-
     def _create_feature_to_idx_dict(
         self, training_data: TrainingData
     ) -> Dict[Text, Dict[Text, int]]:
@@ -194,6 +158,23 @@ def _build_feature_vocabulary(
 
         return feature_vocabulary
 
+    def _create_sparse_features(self, message: Message) -> None:
+        """Convert incoming messages into sparse features using the configured
+        features."""
+
+        # [:-1] to remove CLS token
+        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+
+        sentence_features = self._tokens_to_features(tokens)
+        one_hot_feature_vector = self._features_to_one_hot(sentence_features)
+
+        sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)
+
+        sparse_features = self._combine_with_existing_sparse_features(
+            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        )
+        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], sparse_features)
+
     def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
         """Convert words into discrete features."""
 
@@ -234,6 +215,33 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
 
         return sentence_features
 
+    def _features_to_one_hot(
+        self, sentence_features: List[Dict[Text, Any]]
+    ) -> np.ndarray:
+        """Convert the word features into a one-hot presentation using the indices
+        in the feature-to-idx dictionary."""
+
+        # +1 for CLS token
+        one_hot_feature_vector = np.zeros(
+            [len(sentence_features) + 1, self.number_of_features]
+        )
+
+        for token_idx, toke_features in enumerate(sentence_features):
+            for feature_name, feature_value in toke_features.items():
+                if (
+                    feature_name in self.feature_to_idx_dict
+                    and str(feature_value) in self.feature_to_idx_dict[feature_name]
+                ):
+                    feature_idx = self.feature_to_idx_dict[feature_name][
+                        str(feature_value)
+                    ]
+                    one_hot_feature_vector[token_idx][feature_idx] = 1
+
+        # set vector of CLS token to sum of everything
+        one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0)
+
+        return one_hot_feature_vector
+
     def _get_feature_value(
         self,
         feature: Text,
@@ -256,14 +264,6 @@ def _get_feature_value(
             )
         return value
 
-    def _calculate_number_of_features(self) -> int:
-        return sum(
-            [
-                len(feature_values.values())
-                for feature_values in self.feature_to_idx_dict.values()
-            ]
-        )
-
     @classmethod
     def load(
         cls,

From 1015f444be3e86c4a42afce1542417311078d53a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 08:40:26 +0100
Subject: [PATCH 147/633] move label key to rasa model data

---
 .../embedding_intent_classifier.py            | 35 +++++------
 rasa/utils/tf_model_data.py                   | 59 ++++++++-----------
 tests/utils/test_tf_model_data.py             | 17 +++---
 3 files changed, 48 insertions(+), 63 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index e856671cc1c3..b16706352bb1 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -248,7 +248,7 @@ def __init__(
 
         self._tf_config = train_utils.load_tf_config(self.component_config)
 
-        self.model_data_example = None
+        self.data_example = None
 
     # training data helpers:
     @staticmethod
@@ -483,12 +483,12 @@ def _create_model_data(
         label_ids = np.array(label_ids)
         tag_ids = np.array(tag_ids)
 
-        model_data = RasaModelData()
+        model_data = RasaModelData(label_key="label_ids")
         model_data.add_features("text_features", [X_sparse, X_dense])
         model_data.add_features("label_features", [Y_sparse, Y_dense])
         if label_attribute and model_data.feature_not_exists("label_features"):
             # no label features are present, get default features from _label_data
-            model_data.set(
+            model_data.add_features(
                 "label_features", self._use_default_label_features(label_ids)
             )
 
@@ -564,9 +564,7 @@ def train(
                 return
 
         # keep one example for persisting and loading
-        self.model_data_example = {
-            k: [v[:1] for v in vs] for k, vs in model_data.items()
-        }
+        self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
         # TODO set it in the model
         # set random seed
@@ -599,7 +597,7 @@ def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
 
         # create session data from message and convert it into a batch of 1
         model_data = self._create_model_data([message])
-        predict_dataset = model_data.as_tf_dataset(1, label_key="label_ids")
+        predict_dataset = model_data.as_tf_dataset(1)
         batch_in = next(iter(predict_dataset))
 
         return self.predict_func(batch_in)
@@ -739,10 +737,8 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
         self.model.save_weights(tf_model_file, save_format="tf")
 
-        with open(
-            os.path.join(model_dir, file_name + ".model_data_example.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.model_data_example, f)
+        with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "wb") as f:
+            pickle.dump(self.data_example, f)
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
             pickle.dump(self._label_data, f)
@@ -789,10 +785,10 @@ def load(
         # with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
         #    _tf_config = pickle.load(f)
 
-        with open(
-            os.path.join(model_dir, file_name + ".model_data_example.pkl"), "rb"
-        ) as f:
-            model_data_example = RasaModelData(pickle.load(f))
+        with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "rb") as f:
+            model_data_example = RasaModelData(
+                label_key="label_ids", data=pickle.load(f)
+            )
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
             label_data = pickle.load(f)
@@ -827,20 +823,21 @@ def load(
             0,
             label_key="label_ids",
             batch_strategy=meta[BATCH_STRATEGY],
-            silent=True,
-            eager=True,
+            silent=True,  # don't confuse users with training output
+            eager=True,  # no need to build tf graph, eager is faster here
         )
         model.load_weights(tf_model_file)
 
         # build the graph for prediction
         model.set_training_phase(False)
         model_data = RasaModelData(
-            {k: vs for k, vs in model_data_example.items() if "text" in k}
+            label_key="label_ids",
+            data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
         model.data_signature = model_data.get_signature()
         model.build_for_predict(model_data)
         predict_dataset = model_data.as_tf_dataset(
-            1, label_key="label_ids", batch_strategy="sequence", shuffle=False
+            1, batch_strategy="sequence", shuffle=False
         )
         predict_func = tf.function(
             func=model.predict, input_signature=[predict_dataset.element_spec]
diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
index 8dad88cbcae4..01698a1f2e97 100644
--- a/rasa/utils/tf_model_data.py
+++ b/rasa/utils/tf_model_data.py
@@ -13,18 +13,17 @@ class DataSignature(NamedTuple):
 
 
 class RasaModelData:
-    def __init__(self, data: Optional[Dict[Text, List[np.ndarray]]] = None):
-        if data is None:
-            self.data = {}
-        else:
-            self.data = data
+    def __init__(
+        self,
+        label_key: Optional[Text] = None,
+        data: Optional[Dict[Text, List[np.ndarray]]] = None,
+    ):
+        self.data = data or {}
+        self.label_key = label_key or ""
 
     def get(self, key: Text) -> List[np.ndarray]:
         return self.data[key]
 
-    def set(self, key: Text, value: List[np.ndarray]):
-        self.data[key] = value
-
     def items(self):
         return self.data.items()
 
@@ -38,13 +37,13 @@ def feature_not_exists(self, key: Text) -> bool:
         return key not in self.data or not self.data[key]
 
     def split(
-        self, number_of_test_examples: int, random_seed: int, label_key: Text
+        self, number_of_test_examples: int, random_seed: int
     ) -> Tuple["RasaModelData", "RasaModelData"]:
         """Create random hold out test set using stratified split."""
 
-        self._check_label_key(label_key)
+        self._check_label_key(self.label_key)
 
-        label_ids = self._create_label_ids(self.data[label_key][0])
+        label_ids = self._create_label_ids(self.data[self.label_key][0])
         label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
 
         self._check_train_test_sizes(number_of_test_examples, label_counts)
@@ -120,7 +119,7 @@ def shuffle(self) -> None:
         ids = np.random.permutation(data_points)
         self.data = self._data_for_ids(ids)
 
-    def balance(self, batch_size: int, shuffle: bool, label_key: Text) -> None:
+    def balance(self, batch_size: int, shuffle: bool) -> None:
         """Mix session data to account for class imbalance.
 
         This batching strategy puts rare classes approximately in every other batch,
@@ -128,10 +127,10 @@ def balance(self, batch_size: int, shuffle: bool, label_key: Text) -> None:
         that more populated classes should appear more often.
         """
 
-        if label_key not in self.data or len(self.data[label_key]) > 1:
-            raise ValueError(f"Key '{label_key}' not in RasaModelData.")
+        if self.label_key not in self.data or len(self.data[self.label_key]) > 1:
+            raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
 
-        label_ids = self._create_label_ids(self.data[label_key][0])
+        label_ids = self._create_label_ids(self.data[self.label_key][0])
 
         unique_label_ids, counts_label_ids = np.unique(
             label_ids, return_counts=True, axis=0
@@ -217,20 +216,14 @@ def get_feature_dimension(self, key: Text) -> int:
         return number_of_features
 
     def convert_to_tf_dataset(
-        self,
-        batch_size: int,
-        label_key: Text,
-        batch_strategy: Text = "sequence",
-        shuffle: bool = False,
+        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
     ):
         """Create tf dataset."""
 
         shapes, types = self._get_shapes_types()
 
         return tf.data.Dataset.from_generator(
-            lambda batch_size_: self._gen_batch(
-                batch_size_, label_key, batch_strategy, shuffle
-            ),
+            lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle),
             output_types=types,
             output_shapes=shapes,
             args=([batch_size]),
@@ -294,7 +287,6 @@ def batch_tuple_sizes(self) -> Dict[Text, int]:
     def as_tf_dataset(
         self,
         batch_size: Union["tf.Tensor", int],
-        label_key: Text,
         batch_strategy: Text = "sequence",
         shuffle: bool = False,
     ) -> "tf.data.Dataset":
@@ -303,9 +295,7 @@ def as_tf_dataset(
         shapes, types = self._get_shapes_types()
 
         return tf.data.Dataset.from_generator(
-            lambda batch_size_: self._gen_batch(
-                batch_size_, label_key, batch_strategy, shuffle
-            ),
+            lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle),
             output_types=types,
             output_shapes=shapes,
             args=([batch_size]),
@@ -347,11 +337,7 @@ def append_type(features: np.ndarray):
         return tuple(shapes), tuple(types)
 
     def _gen_batch(
-        self,
-        batch_size: int,
-        label_key: Text,
-        batch_strategy: Text = "sequence",
-        shuffle: bool = False,
+        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
     ) -> Generator[Tuple, None, None]:
         """Generate batches."""
 
@@ -359,7 +345,7 @@ def _gen_batch(
             self.shuffle()
 
         if batch_strategy == "balanced":
-            self.balance(batch_size, shuffle, label_key)
+            self.balance(batch_size, shuffle)
 
         num_examples = self.get_number_of_examples()
         num_batches = num_examples // batch_size + int(num_examples % batch_size > 0)
@@ -406,7 +392,7 @@ def _split_by_label_ids(
         label_data = []
         for label_id in unique_label_ids:
             ids = label_ids == label_id
-            label_data.append(RasaModelData(self._data_for_ids(ids)))
+            label_data.append(RasaModelData(self.label_key, self._data_for_ids(ids)))
         return label_data
 
     def _check_label_key(self, label_key: Text):
@@ -441,7 +427,10 @@ def _convert_train_test_split(
                 data_val[key].append(output_values[(index * 2) + 1])
                 index += 1
 
-        return RasaModelData(data_train), RasaModelData(data_val)
+        return (
+            RasaModelData(self.label_key, data_train),
+            RasaModelData(self.label_key, data_val),
+        )
 
     @staticmethod
     def _combine_features(
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index 6708c0f89d5a..e8d28fc5dcc2 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -8,7 +8,8 @@
 @pytest.fixture
 async def model_data() -> RasaModelData:
     return RasaModelData(
-        {
+        label_key="intent_ids",
+        data={
             "text_features": [
                 np.array(
                     [
@@ -52,7 +53,7 @@ async def model_data() -> RasaModelData:
                     ]
                 )
             ],
-        }
+        },
     )
 
 
@@ -75,7 +76,7 @@ def test_split_session_data_by_label(model_data: RasaModelData):
 
 
 def test_train_val_split(model_data: RasaModelData):
-    train_model_data, test_model_data = model_data.split(2, 42, "intent_ids")
+    train_model_data, test_model_data = model_data.split(2, 42)
 
     for k, values in model_data.items():
         assert len(values) == len(train_model_data.get(k))
@@ -95,7 +96,7 @@ def test_train_val_split(model_data: RasaModelData):
 @pytest.mark.parametrize("size", [0, 1, 5])
 def test_train_val_split_incorrect_size(model_data: RasaModelData, size: int):
     with pytest.raises(ValueError):
-        model_data.split(size, 42, "intent_ids")
+        model_data.split(size, 42)
 
 
 def test_session_data_for_ids(model_data: RasaModelData):
@@ -116,15 +117,13 @@ def test_get_number_of_examples(model_data: RasaModelData):
 
 
 def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):
-    model_data.set("dense", [np.random.randint(5, size=(2, 10))])
+    model_data.add_features("dense", [np.random.randint(5, size=(2, 10))])
     with pytest.raises(ValueError):
         model_data.get_number_of_examples()
 
 
 def test_gen_batch(model_data: RasaModelData):
-    iterator = model_data._gen_batch(
-        2, "intent_ids", shuffle=True, batch_strategy="balanced"
-    )
+    iterator = model_data._gen_batch(2, shuffle=True, batch_strategy="balanced")
 
     batch = next(iterator)
     assert len(batch) == 7
@@ -143,6 +142,6 @@ def test_gen_batch(model_data: RasaModelData):
 
 
 def test_balance_session_data(model_data: RasaModelData):
-    model_data.balance(2, False, "intent_ids")
+    model_data.balance(2, False)
 
     assert np.all(model_data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))

From c480f06c767df322851f020c3b1c6060edf1b2f0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 12:38:09 +0100
Subject: [PATCH 148/633] review comments

---
 .../embedding_intent_classifier.py            | 19 +++-----
 rasa/utils/tf_model_data.py                   |  6 +--
 rasa/utils/tf_models.py                       | 48 ++++++++++++++++---
 rasa/utils/train_utils.py                     | 36 +-------------
 4 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b16706352bb1..8e1463c8ff8b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -25,7 +25,7 @@
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
 )
-from rasa.utils.tf_model_data import RasaModelData, DataSignature
+from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
 
 import tensorflow as tf
 import tensorflow_addons as tfa
@@ -821,7 +821,6 @@ def load(
             1,
             0,
             0,
-            label_key="label_ids",
             batch_strategy=meta[BATCH_STRATEGY],
             silent=True,  # don't confuse users with training output
             eager=True,  # no need to build tf graph, eager is faster here
@@ -859,7 +858,7 @@ def load(
 class DIET(tf_models.RasaModel):
     @staticmethod
     def _create_sparse_dense_layer(
-        data_signature: List[DataSignature],
+        data_signature: List[FeatureSignature],
         name: Text,
         reg_lambda: float,
         dense_dim: int,
@@ -880,7 +879,7 @@ def _create_sparse_dense_layer(
             )
 
     @staticmethod
-    def _input_dim(data_signature: List[DataSignature], dense_dim: int) -> int:
+    def _input_dim(data_signature: List[FeatureSignature], dense_dim: int) -> int:
 
         for is_sparse, shape in data_signature:
             if not is_sparse:
@@ -893,7 +892,7 @@ def _input_dim(data_signature: List[DataSignature], dense_dim: int) -> int:
 
     def __init__(
         self,
-        data_signature: Dict[Text, List[DataSignature]],
+        data_signature: Dict[Text, List[FeatureSignature]],
         label_data: RasaModelData,
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
@@ -903,7 +902,7 @@ def __init__(
         # data
         self.data_signature = data_signature
         label_batch = label_data.prepare_batch()
-        self.tf_label_data = train_utils.batch_to_model_data_format(
+        self.tf_label_data = self.batch_to_model_data_format(
             label_batch, label_data.get_signature()
         )
         self._num_tags = len(inverted_tag_dict)
@@ -1185,9 +1184,7 @@ def _entity_loss(
     def _train_losses_scores(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
-        tf_batch_data = train_utils.batch_to_model_data_format(
-            batch_in, self.data_signature
-        )
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
@@ -1241,9 +1238,7 @@ def build_for_predict(self, model_data: RasaModelData) -> None:
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = train_utils.batch_to_model_data_format(
-            batch_in, self.data_signature
-        )
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tf_model_data.py
index 01698a1f2e97..5607b5d81906 100644
--- a/rasa/utils/tf_model_data.py
+++ b/rasa/utils/tf_model_data.py
@@ -7,7 +7,7 @@
 from collections import defaultdict
 
 
-class DataSignature(NamedTuple):
+class FeatureSignature(NamedTuple):
     is_sparse: bool
     shape: List[int]
 
@@ -95,7 +95,7 @@ def add_mask(self, key: Text, from_key: Text):
                 self.data[key].append(mask)
                 break
 
-    def get_signature(self) -> Dict[Text, List[DataSignature]]:
+    def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
         """Get signature of RasaModelData.
 
         Signature stores the shape and whether features are sparse or not for every
@@ -103,7 +103,7 @@ def get_signature(self) -> Dict[Text, List[DataSignature]]:
 
         return {
             key: [
-                DataSignature(
+                FeatureSignature(
                     True if isinstance(v[0], scipy.sparse.spmatrix) else False,
                     v[0].shape,
                 )
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index c48549178cb0..60bb3aad470d 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -1,3 +1,5 @@
+from collections import defaultdict
+
 import numpy as np
 import logging
 from typing import List, Text, Dict, Tuple, Union
@@ -6,7 +8,7 @@
 from rasa.utils.common import is_logging_disabled
 import tensorflow as tf
 
-from rasa.utils.tf_model_data import RasaModelData
+from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +34,6 @@ def fit(
         batch_size: Union[List[int], int],
         evaluate_on_num_examples: int,
         evaluate_every_num_epochs: int,
-        label_key: Text,
         batch_strategy: Text,
         silent: bool = False,
         eager: bool = False,
@@ -48,7 +49,7 @@ def fit(
             )
 
             model_data, evaluation_model_data = model_data.split(
-                evaluate_on_num_examples, random_seed, label_key=label_key
+                evaluate_on_num_examples, random_seed
             )
 
         disable = silent or is_logging_disabled()
@@ -57,12 +58,10 @@ def fit(
         tf_batch_size = tf.ones((), tf.int32)
 
         def train_dataset_function(x):
-            return model_data.as_tf_dataset(x, label_key, batch_strategy, shuffle=True)
+            return model_data.as_tf_dataset(x, batch_strategy, shuffle=True)
 
         def evaluation_dataset_function(x):
-            return evaluation_model_data.as_tf_dataset(
-                x, label_key, batch_strategy, shuffle=False
-            )
+            return evaluation_model_data.as_tf_dataset(x, batch_strategy, shuffle=False)
 
         if eager:
             # allows increasing batch size
@@ -193,3 +192,38 @@ def evaluate_generator(self, **kwargs) -> None:
 
     def predict_generator(self, **kwargs) -> None:
         raise NotImplementedError
+
+    @staticmethod
+    def batch_to_model_data_format(
+        batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
+        data_signature: Dict[Text, List[FeatureSignature]],
+    ) -> Dict[Text, List[tf.Tensor]]:
+        """Convert input batch tensors into batch data format.
+    
+        Batch contains any number of batch data. The order is equal to the
+        key-value pairs in session data. As sparse data were converted into indices, data,
+        shape before, this methods converts them into sparse tensors. Dense data is
+        kept.
+        """
+
+        batch_data = defaultdict(list)
+
+        idx = 0
+        for k, signature in data_signature.items():
+            for is_sparse, shape in signature:
+                if is_sparse:
+                    # explicitly substitute last dimension in shape with known
+                    # static value
+                    batch_data[k].append(
+                        tf.SparseTensor(
+                            batch[idx],
+                            batch[idx + 1],
+                            [batch[idx + 2][0], batch[idx + 2][1], shape[-1]],
+                        )
+                    )
+                    idx += 3
+                else:
+                    batch_data[k].append(batch[idx])
+                    idx += 1
+
+        return batch_data
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 3de8b325e726..357bcdcc7765 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -4,7 +4,7 @@
 import numpy as np
 import tensorflow as tf
 
-from rasa.utils.tf_model_data import DataSignature
+from rasa.utils.tf_model_data import FeatureSignature
 
 logger = logging.getLogger(__name__)
 
@@ -24,40 +24,6 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto
         return None
 
 
-def batch_to_model_data_format(
-    batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
-    data_signature: Dict[Text, List[DataSignature]],
-) -> Dict[Text, List[tf.Tensor]]:
-    """Convert input batch tensors into batch data format.
-
-    Batch contains any number of batch data. The order is equal to the
-    key-value pairs in session data. As sparse data were converted into indices, data,
-    shape before, this methods converts them into sparse tensors. Dense data is
-    kept.
-    """
-
-    batch_data = defaultdict(list)
-
-    idx = 0
-    for k, signature in data_signature.items():
-        for is_sparse, shape in signature:
-            if is_sparse:
-                # explicitly substitute last dimension in shape with known static value
-                batch_data[k].append(
-                    tf.SparseTensor(
-                        batch[idx],
-                        batch[idx + 1],
-                        [batch[idx + 2][0], batch[idx + 2][1], shape[-1]],
-                    )
-                )
-                idx += 3
-            else:
-                batch_data[k].append(batch[idx])
-                idx += 1
-
-    return batch_data
-
-
 def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
     if similarity_type == "cosine":
         # clip negative values to zero

From 5917965c19a4ee009c149f34b0ba755c7b950bec Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 13:41:14 +0100
Subject: [PATCH 149/633] review comments

---
 .../classifiers/embedding_intent_classifier.py |  1 -
 rasa/utils/tf_models.py                        | 18 ++++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8e1463c8ff8b..a8514bc8f21f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -585,7 +585,6 @@ def train(
             self.component_config[BATCH_SIZES],
             self.component_config[EVAL_NUM_EXAMPLES],
             self.component_config[EVAL_NUM_EPOCHS],
-            label_key="label_ids",
             batch_strategy=self.component_config[BATCH_STRATEGY],
             random_seed=self.component_config[RANDOM_SEED],
         )
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 60bb3aad470d..be99b52ce90e 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -57,11 +57,13 @@ def fit(
 
         tf_batch_size = tf.ones((), tf.int32)
 
-        def train_dataset_function(x):
-            return model_data.as_tf_dataset(x, batch_strategy, shuffle=True)
+        def train_dataset_function(_batch_size):
+            return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
-        def evaluation_dataset_function(x):
-            return evaluation_model_data.as_tf_dataset(x, batch_strategy, shuffle=False)
+        def evaluation_dataset_function(_batch_size):
+            return evaluation_model_data.as_tf_dataset(
+                _batch_size, batch_strategy, shuffle=False
+            )
 
         if eager:
             # allows increasing batch size
@@ -78,18 +80,18 @@ def evaluation_dataset_function(x):
         if evaluate_on_num_examples > 0:
             if eager:
                 tf_evaluation_dataset_function = evaluation_dataset_function
-                tf_evaluation_function = self.evaluate_on_batch
+                tf_evaluation_on_batch_function = self.evaluate_on_batch
             else:
                 tf_evaluation_dataset_function = tf.function(
                     func=evaluation_dataset_function
                 )
-                tf_evaluation_function = tf.function(
+                tf_evaluation_on_batch_function = tf.function(
                     self.evaluate_on_batch,
                     input_signature=[tf_evaluation_dataset_function(1).element_spec],
                 )
         else:
             tf_evaluation_dataset_function = None
-            tf_evaluation_function = None
+            tf_evaluation_on_batch_function = None
 
         for ep in pbar:
             ep_batch_size = tf_batch_size * train_utils.linearly_increasing_batch_size(
@@ -123,7 +125,7 @@ def evaluation_dataset_function(x):
                     # Eval on batches
                     self.set_training_phase(False)
                     for batch_in in tf_evaluation_dataset_function(ep_batch_size):
-                        tf_evaluation_function(batch_in)
+                        tf_evaluation_on_batch_function(batch_in)
 
                 # Get the metric results
                 postfix_dict.update(

From 6329a8890076ca95d093620cacd4dc449ea395d5 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 13:51:15 +0100
Subject: [PATCH 150/633] Update rasa/utils/tf_models.py

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/utils/tf_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index be99b52ce90e..4ad61f5e7a2f 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -37,7 +37,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         eager: bool = False,
-        random_seed: int = 42,
+        random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:
         """Train tf graph"""

From 7ff85249d2d6c49dfee881e3343f3a604066e6d8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 13:54:59 +0100
Subject: [PATCH 151/633] move increase batch size method

---
 rasa/utils/tf_models.py   | 22 ++++++++++++++++++++--
 rasa/utils/train_utils.py | 22 ----------------------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index 4ad61f5e7a2f..ee7ec9fe986c 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -4,7 +4,6 @@
 import logging
 from typing import List, Text, Dict, Tuple, Union
 from tqdm import tqdm
-from rasa.utils import train_utils
 from rasa.utils.common import is_logging_disabled
 import tensorflow as tf
 
@@ -94,7 +93,7 @@ def evaluation_dataset_function(_batch_size):
             tf_evaluation_on_batch_function = None
 
         for ep in pbar:
-            ep_batch_size = tf_batch_size * train_utils.linearly_increasing_batch_size(
+            ep_batch_size = tf_batch_size * self.linearly_increasing_batch_size(
                 ep, batch_size, epochs
             )
 
@@ -229,3 +228,22 @@ def batch_to_model_data_format(
                     idx += 1
 
         return batch_data
+
+    @staticmethod
+    def linearly_increasing_batch_size(
+        epoch: int, batch_size: Union[List[int], int], epochs: int
+    ) -> int:
+        """Linearly increase batch size with every epoch.
+
+        The idea comes from https://arxiv.org/abs/1711.00489.
+        """
+
+        if not isinstance(batch_size, list):
+            return int(batch_size)
+
+        if epochs > 1:
+            return int(
+                batch_size[0] + epoch * (batch_size[1] - batch_size[0]) / (epochs - 1)
+            )
+        else:
+            return int(batch_size[0])
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 357bcdcc7765..ecc7dd563510 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,10 +1,7 @@
-from collections import defaultdict
 import logging
 from typing import List, Optional, Text, Dict, Tuple, Union, Any, NamedTuple
-import numpy as np
 import tensorflow as tf
 
-from rasa.utils.tf_model_data import FeatureSignature
 
 logger = logging.getLogger(__name__)
 
@@ -33,25 +30,6 @@ def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
         return tf.nn.softmax(sim)
 
 
-def linearly_increasing_batch_size(
-    epoch: int, batch_size: Union[List[int], int], epochs: int
-) -> int:
-    """Linearly increase batch size with every epoch.
-
-    The idea comes from https://arxiv.org/abs/1711.00489.
-    """
-
-    if not isinstance(batch_size, list):
-        return int(batch_size)
-
-    if epochs > 1:
-        return int(
-            batch_size[0] + epoch * (batch_size[1] - batch_size[0]) / (epochs - 1)
-        )
-    else:
-        return int(batch_size[0])
-
-
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 

From 02ecea844d9047753d30280fc6d8983f8e61a086 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 14:03:04 +0100
Subject: [PATCH 152/633] clean up imports

---
 .../classifiers/embedding_intent_classifier.py   | 16 +++++-----------
 rasa/utils/tf_models.py                          |  8 +++-----
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index a8514bc8f21f..3f56eaf5c14b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,10 +1,11 @@
 import logging
-
 import numpy as np
 import os
 import pickle
 import scipy.sparse
 import warnings
+import tensorflow as tf
+import tensorflow_addons as tfa
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, Callable
 
@@ -13,7 +14,6 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import any_of
-
 from rasa.utils import train_utils
 from rasa.utils import tf_layers
 from rasa.utils import tf_models
@@ -26,22 +26,16 @@
     TOKENS_NAMES,
 )
 from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
-
-import tensorflow as tf
-import tensorflow_addons as tfa
-
-
-logger = logging.getLogger(__name__)
-
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
 
 
-shapes, types = None, None
+logger = logging.getLogger(__name__)
+
 
-# constants
+# constants - configuration parameters
 HIDDEN_LAYERS_SIZES_TEXT = "hidden_layers_sizes_text"
 HIDDEN_LAYERS_SIZES_LABEL = "hidden_layers_sizes_label"
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tf_models.py
index ee7ec9fe986c..57ab8321246b 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tf_models.py
@@ -1,12 +1,10 @@
-from collections import defaultdict
-
+import tensorflow as tf
 import numpy as np
 import logging
-from typing import List, Text, Dict, Tuple, Union
+from collections import defaultdict
+from typing import List, Text, Dict, Tuple, Union, Optional
 from tqdm import tqdm
 from rasa.utils.common import is_logging_disabled
-import tensorflow as tf
-
 from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
 
 logger = logging.getLogger(__name__)

From fbc9e30e9c0b3e683ae7b721d3136dc3736ef849 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 21 Jan 2020 15:51:18 +0100
Subject: [PATCH 153/633] create tensorflow package

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 5 ++---
 rasa/utils/tensorflow/__init__.py                   | 0
 rasa/utils/{ => tensorflow}/tf_layers.py            | 0
 rasa/utils/{ => tensorflow}/tf_model_data.py        | 0
 rasa/utils/{ => tensorflow}/tf_models.py            | 2 +-
 tests/utils/test_tf_model_data.py                   | 2 +-
 6 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 rasa/utils/tensorflow/__init__.py
 rename rasa/utils/{ => tensorflow}/tf_layers.py (100%)
 rename rasa/utils/{ => tensorflow}/tf_model_data.py (100%)
 rename rasa/utils/{ => tensorflow}/tf_models.py (99%)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3f56eaf5c14b..32466e78b887 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -15,8 +15,8 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import any_of
 from rasa.utils import train_utils
-from rasa.utils import tf_layers
-from rasa.utils import tf_models
+from rasa.utils.tensorflow import tf_layers, tf_models
+from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -25,7 +25,6 @@
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
 )
-from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.model import Metadata
diff --git a/rasa/utils/tensorflow/__init__.py b/rasa/utils/tensorflow/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/rasa/utils/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
similarity index 100%
rename from rasa/utils/tf_layers.py
rename to rasa/utils/tensorflow/tf_layers.py
diff --git a/rasa/utils/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
similarity index 100%
rename from rasa/utils/tf_model_data.py
rename to rasa/utils/tensorflow/tf_model_data.py
diff --git a/rasa/utils/tf_models.py b/rasa/utils/tensorflow/tf_models.py
similarity index 99%
rename from rasa/utils/tf_models.py
rename to rasa/utils/tensorflow/tf_models.py
index 57ab8321246b..babc777d7da5 100644
--- a/rasa/utils/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -5,7 +5,7 @@
 from typing import List, Text, Dict, Tuple, Union, Optional
 from tqdm import tqdm
 from rasa.utils.common import is_logging_disabled
-from rasa.utils.tf_model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index e8d28fc5dcc2..12a03997e738 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -2,7 +2,7 @@
 import scipy.sparse
 import numpy as np
 
-from rasa.utils.tf_model_data import RasaModelData
+from rasa.utils.tensorflow.tf_model_data import RasaModelData
 
 
 @pytest.fixture

From 65448df7a2ca351c4f42728eec73fa8c4814b9e5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 22 Jan 2020 12:10:44 +0100
Subject: [PATCH 154/633] fix types

---
 rasa/utils/tensorflow/tf_layers.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 5fd8f82dd68b..5a6e00c26605 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -8,7 +8,7 @@
 
 
 class SparseDropout(tf.keras.layers.Dropout):
-    def call(self, inputs: tf.Tensor, training: bool) -> tf.Tensor:
+    def call(self, inputs: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
         to_retain_prob = tf.random.uniform(
             tf.shape(inputs.values), 0, 1, inputs.values.dtype
@@ -80,7 +80,7 @@ def __init__(
             )
             self._ffn_layers.append(tf.keras.layers.Dropout(rate=droprate))
 
-    def call(self, x: tf.Tensor, training: bool) -> tf.Tensor:
+    def call(self, x: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
         for layer in self._ffn_layers:
             x = layer(x, training=training)
 
@@ -166,7 +166,7 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
 
         return output, attention_weights
 
-    def __init__(self, d_model, num_heads: int, reg_lambda: float) -> None:
+    def __init__(self, d_model: int, num_heads: int, reg_lambda: float) -> None:
         super(MultiHeadAttention, self).__init__()
         self.num_heads = num_heads
         self.d_model = d_model
@@ -244,9 +244,9 @@ def call(
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self,
-        d_model: tf.Tensor,
+        d_model: int,
         num_heads: int,
-        dff: tf.Tensor,
+        dff: int,
         reg_lambda: float,
         rate: float = 0.1,
     ) -> None:
@@ -269,7 +269,7 @@ def __init__(
             tf.keras.layers.Dropout(rate),
         ]
 
-    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: bool) -> tf.Tensor:
+    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
         attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
@@ -316,9 +316,9 @@ def _positional_encoding(cls, position, d_model) -> tf.Tensor:
     def __init__(
         self,
         num_layers: int,
-        d_model,
+        d_model: int,
         num_heads: int,
-        dff,
+        dff: int,
         max_seq_length: int,
         reg_lambda: float,
         rate: float = 0.1,
@@ -346,7 +346,7 @@ def __init__(
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
     def call(
-        self, x: "tf.Tensor", pad_mask: "tf.Tensor", training: bool
+        self, x: "tf.Tensor", pad_mask: "tf.Tensor", training: tf.Tensor
     ) -> "tf.Tensor":
 
         # adding embedding and position encoding.
@@ -373,7 +373,7 @@ def call(
 
 
 class InputMask(tf.keras.layers.Layer):
-    def build(self, input_shape: List[int]) -> None:
+    def build(self, input_shape: "tf.TensorShape") -> None:
         initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]),
@@ -384,7 +384,7 @@ def build(self, input_shape: List[int]) -> None:
         self.built = True
 
     def call(
-        self, x: "tf.Tensor", mask: "tf.Tensor", training: bool
+        self, x: "tf.Tensor", mask: "tf.Tensor", training: "tf.Tensor"
     ) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Randomly mask input sequences."""
 

From e88ce39e97a3600ac6fe88a82298fbdbcc12704b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 22 Jan 2020 13:21:12 +0100
Subject: [PATCH 155/633] fix quates in types

---
 rasa/utils/tensorflow/tf_layers.py | 110 ++++++++++++++---------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 5a6e00c26605..e2f935b5e698 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -346,8 +346,8 @@ def __init__(
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
     def call(
-        self, x: "tf.Tensor", pad_mask: "tf.Tensor", training: tf.Tensor
-    ) -> "tf.Tensor":
+        self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor
+    ) -> tf.Tensor:
 
         # adding embedding and position encoding.
         x = self._embedding(x)  # (batch_size, seq_len, d_model)
@@ -384,8 +384,8 @@ def build(self, input_shape: "tf.TensorShape") -> None:
         self.built = True
 
     def call(
-        self, x: "tf.Tensor", mask: "tf.Tensor", training: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        self, x: tf.Tensor, mask: tf.Tensor, training: tf.Tensor
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Randomly mask input sequences."""
 
         # do not substitute with cls token
@@ -440,7 +440,7 @@ def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
             name="transitions",
         )
 
-    def call(self, logits: "tf.Tensor", sequence_lengths: "tf.Tensor") -> "tf.TEnsor":
+    def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
         pred_ids, _ = tfa.text.crf.crf_decode(
             logits, self.transition_params, sequence_lengths
         )
@@ -453,10 +453,10 @@ def call(self, logits: "tf.Tensor", sequence_lengths: "tf.Tensor") -> "tf.TEnsor
 
     def loss(
         self,
-        logits: "tf.Tensor",
-        tag_indices: "tf.Tensor",
-        sequence_lengths: "tf.Tensor",
-    ) -> "tf.Tensor":
+        logits: tf.Tensor,
+        tag_indices: tf.Tensor,
+        sequence_lengths: tf.Tensor,
+    ) -> tf.Tensor:
         log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
             logits, tag_indices, sequence_lengths, self.transition_params
         )
@@ -485,12 +485,12 @@ def __init__(
         self.scale_loss = scale_loss
 
     @staticmethod
-    def _make_flat(x: "tf.Tensor") -> "tf.Tensor":
+    def _make_flat(x: tf.Tensor) -> tf.Tensor:
         """Make tensor 2D."""
 
         return tf.reshape(x, (-1, x.shape[-1]))
 
-    def _random_indices(self, batch_size: "tf.Tensor", total_candidates: "tf.Tensor"):
+    def _random_indices(self, batch_size: tf.Tensor, total_candidates: tf.Tensor):
         def rand_idxs():
             """Create random tensor of indices"""
             # (1, num_neg)
@@ -529,8 +529,8 @@ def body(i, out):
 
     @staticmethod
     def _sample_idxs(
-        batch_size: "tf.Tensor", x: "tf.Tensor", idxs: "tf.Tensor"
-    ) -> "tf.Tensor":
+        batch_size: tf.Tensor, x: tf.Tensor, idxs: tf.Tensor
+    ) -> tf.Tensor:
         """Sample negative examples for given indices"""
 
         tiled = tf.tile(tf.expand_dims(x, 0), (batch_size, 1, 1))
@@ -538,8 +538,8 @@ def _sample_idxs(
         return tf.gather(tiled, idxs, batch_dims=1)
 
     def _get_bad_mask(
-        self, labels: "tf.Tensor", target_labels: "tf.Tensor", idxs: "tf.Tensor"
-    ) -> "tf.Tensor":
+        self, labels: tf.Tensor, target_labels: tf.Tensor, idxs: tf.Tensor
+    ) -> tf.Tensor:
         """Calculate bad mask for given indices.
 
         Checks that input features are different for positive negative samples.
@@ -553,8 +553,8 @@ def _get_bad_mask(
         )
 
     def _get_negs(
-        self, embeds: "tf.Tensor", labels: "tf.Tensor", target_labels: "tf.Tensor"
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        self, embeds: tf.Tensor, labels: tf.Tensor, target_labels: tf.Tensor
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Get negative examples from given tensor."""
 
         embeds_flat = self._make_flat(embeds)
@@ -580,13 +580,13 @@ def _get_negs(
 
     def _sample_negatives(
         self,
-        inputs_embed: "tf.Tensor",
-        labels_embed: "tf.Tensor",
-        labels: "tf.Tensor",
-        all_labels_embed: "tf.Tensor",
-        all_labels: "tf.Tensor",
+        inputs_embed: tf.Tensor,
+        labels_embed: tf.Tensor,
+        labels: tf.Tensor,
+        all_labels_embed: tf.Tensor,
+        all_labels: tf.Tensor,
     ) -> Tuple[
-        "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"
+        tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor
     ]:
         """Sample negative examples."""
 
@@ -610,8 +610,8 @@ def _sample_negatives(
 
     @staticmethod
     def sim(
-        a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"] = None
-    ) -> "tf.Tensor":
+        a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None
+    ) -> tf.Tensor:
         """Calculate similarity between given tensors."""
 
         sim = tf.reduce_sum(a * b, -1)
@@ -622,14 +622,14 @@ def sim(
 
     def _train_sim(
         self,
-        pos_inputs_embed: "tf.Tensor",
-        pos_labels_embed: "tf.Tensor",
-        neg_inputs_embed: "tf.Tensor",
-        neg_labels_embed: "tf.Tensor",
-        inputs_bad_negs: "tf.Tensor",
-        labels_bad_negs: "tf.Tensor",
-        mask: Optional["tf.Tensor"],
-    ) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]:
+        pos_inputs_embed: tf.Tensor,
+        pos_labels_embed: tf.Tensor,
+        neg_inputs_embed: tf.Tensor,
+        neg_labels_embed: tf.Tensor,
+        inputs_bad_negs: tf.Tensor,
+        labels_bad_negs: tf.Tensor,
+        mask: Optional[tf.Tensor],
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
         """Define similarity."""
 
         # calculate similarity with several
@@ -659,7 +659,7 @@ def _train_sim(
         return sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li
 
     @staticmethod
-    def _calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
+    def _calc_accuracy(sim_pos: tf.Tensor, sim_neg: tf.Tensor) -> tf.Tensor:
         """Calculate accuracy"""
 
         max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
@@ -669,13 +669,13 @@ def _calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor":
 
     def _loss_margin(
         self,
-        sim_pos: "tf.Tensor",
-        sim_neg_il: "tf.Tensor",
-        sim_neg_ll: "tf.Tensor",
-        sim_neg_ii: "tf.Tensor",
-        sim_neg_li: "tf.Tensor",
-        mask: Optional["tf.Tensor"],
-    ) -> "tf.Tensor":
+        sim_pos: tf.Tensor,
+        sim_neg_il: tf.Tensor,
+        sim_neg_ll: tf.Tensor,
+        sim_neg_ii: tf.Tensor,
+        sim_neg_li: tf.Tensor,
+        mask: Optional[tf.Tensor],
+    ) -> tf.Tensor:
         """Define max margin loss."""
 
         # loss for maximizing similarity with correct action
@@ -716,13 +716,13 @@ def _loss_margin(
 
     def _loss_softmax(
         self,
-        sim_pos: "tf.Tensor",
-        sim_neg_il: "tf.Tensor",
-        sim_neg_ll: "tf.Tensor",
-        sim_neg_ii: "tf.Tensor",
-        sim_neg_li: "tf.Tensor",
-        mask: Optional["tf.Tensor"],
-    ) -> "tf.Tensor":
+        sim_pos: tf.Tensor,
+        sim_neg_il: tf.Tensor,
+        sim_neg_ll: tf.Tensor,
+        sim_neg_ii: tf.Tensor,
+        sim_neg_li: tf.Tensor,
+        mask: Optional[tf.Tensor],
+    ) -> tf.Tensor:
         """Define softmax loss."""
 
         logits = tf.concat(
@@ -771,13 +771,13 @@ def _chosen_loss(self) -> Callable:
 
     def call(
         self,
-        inputs_embed: "tf.Tensor",
-        labels_embed: "tf.Tensor",
-        labels: "tf.Tensor",
-        all_labels_embed: "tf.Tensor",
-        all_labels: "tf.Tensor",
-        mask: Optional["tf.Tensor"] = None,
-    ) -> Tuple["tf.Tensor", "tf.Tensor"]:
+        inputs_embed: tf.Tensor,
+        labels_embed: tf.Tensor,
+        labels: tf.Tensor,
+        all_labels_embed: tf.Tensor,
+        all_labels: tf.Tensor,
+        mask: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Calculate loss and accuracy."""
 
         (

From 223534b1e1f978785b5196a92839368cc5ff79b4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 22 Jan 2020 14:15:15 +0100
Subject: [PATCH 156/633] small fixes

---
 rasa/utils/tensorflow/tf_layers.py | 21 +++++----------------
 rasa/utils/tensorflow/tf_models.py |  8 +++++---
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index e2f935b5e698..8ee4523606e5 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -345,9 +345,7 @@ def __init__(
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
-    def call(
-        self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor
-    ) -> tf.Tensor:
+    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
         # adding embedding and position encoding.
         x = self._embedding(x)  # (batch_size, seq_len, d_model)
@@ -452,10 +450,7 @@ def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
         return pred_ids * mask
 
     def loss(
-        self,
-        logits: tf.Tensor,
-        tag_indices: tf.Tensor,
-        sequence_lengths: tf.Tensor,
+        self, logits: tf.Tensor, tag_indices: tf.Tensor, sequence_lengths: tf.Tensor,
     ) -> tf.Tensor:
         log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
             logits, tag_indices, sequence_lengths, self.transition_params
@@ -528,9 +523,7 @@ def body(i, out):
         )[1]
 
     @staticmethod
-    def _sample_idxs(
-        batch_size: tf.Tensor, x: tf.Tensor, idxs: tf.Tensor
-    ) -> tf.Tensor:
+    def _sample_idxs(batch_size: tf.Tensor, x: tf.Tensor, idxs: tf.Tensor) -> tf.Tensor:
         """Sample negative examples for given indices"""
 
         tiled = tf.tile(tf.expand_dims(x, 0), (batch_size, 1, 1))
@@ -585,9 +578,7 @@ def _sample_negatives(
         labels: tf.Tensor,
         all_labels_embed: tf.Tensor,
         all_labels: tf.Tensor,
-    ) -> Tuple[
-        tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor
-    ]:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
         """Sample negative examples."""
 
         pos_inputs_embed = tf.expand_dims(inputs_embed, -2)
@@ -609,9 +600,7 @@ def _sample_negatives(
         )
 
     @staticmethod
-    def sim(
-        a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None
-    ) -> tf.Tensor:
+    def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
         """Calculate similarity between given tensors."""
 
         sim = tf.reduce_sum(a * b, -1)
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index babc777d7da5..2aaca0637f30 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -50,7 +50,6 @@ def fit(
             )
 
         disable = silent or is_logging_disabled()
-        pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         tf_batch_size = tf.ones((), tf.int32)
 
@@ -71,7 +70,7 @@ def evaluation_dataset_function(_batch_size):
             tf_train_dataset_function = tf.function(func=train_dataset_function)
             tf_train_on_batch_function = tf.function(
                 self.train_on_batch,
-                input_signature=[tf_train_dataset_function(1).element_spec],
+                input_signature=[tf_train_dataset_function(tf_batch_size).element_spec],
             )
 
         if evaluate_on_num_examples > 0:
@@ -84,12 +83,15 @@ def evaluation_dataset_function(_batch_size):
                 )
                 tf_evaluation_on_batch_function = tf.function(
                     self.evaluate_on_batch,
-                    input_signature=[tf_evaluation_dataset_function(1).element_spec],
+                    input_signature=[
+                        tf_evaluation_dataset_function(tf_batch_size).element_spec
+                    ],
                 )
         else:
             tf_evaluation_dataset_function = None
             tf_evaluation_on_batch_function = None
 
+        pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
         for ep in pbar:
             ep_batch_size = tf_batch_size * self.linearly_increasing_batch_size(
                 ep, batch_size, epochs

From 246c78e90922019b781305b74e5815905182eaf1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 14:23:47 +0100
Subject: [PATCH 157/633] update train_metrics

---
 .../embedding_intent_classifier.py            | 37 ++++++++---------
 rasa/utils/tensorflow/tf_models.py            | 41 +++++++------------
 2 files changed, 31 insertions(+), 47 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 32466e78b887..8cdd1fecbf43 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -163,7 +163,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 10,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
         INTENT_CLASSIFICATION: True,
@@ -889,7 +889,7 @@ def __init__(
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
     ) -> None:
-        super(DIET, self).__init__(name="DIET")
+        super(DIET, self).__init__()
 
         # data
         self.data_signature = data_signature
@@ -912,7 +912,14 @@ def __init__(
         self.entity_f1 = tfa.metrics.F1Score(
             num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
             average="micro",
+            name="e_f1",
         )
+        self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
+        self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
+        self.total_loss = tf.keras.metrics.Mean(name="t_loss")
+        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
 
         # persist
         self.all_labels_embed = None
@@ -969,7 +976,6 @@ def _prepare_layers(self) -> None:
         self._embed = {}
 
         self.train_metrics = {"t_loss": tf.keras.metrics.Mean(name="t_loss")}
-        self.eval_metrics = {"val_t_loss": tf.keras.metrics.Mean(name="val_t_loss")}
 
         if self.config[MASKED_LM]:
             self._input_mask = tf_layers.InputMask()
@@ -996,8 +1002,6 @@ def _prepare_layers(self) -> None:
             )
             self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
             self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
-            self.eval_metrics["val_m_loss"] = tf.keras.metrics.Mean(name="val_m_loss")
-            self.eval_metrics["val_m_acc"] = tf.keras.metrics.Mean(name="val_m_acc")
         else:
             self._input_mask = None
             self._loss_mask = None
@@ -1026,8 +1030,6 @@ def _prepare_layers(self) -> None:
             )
             self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
             self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
-            self.eval_metrics["val_i_loss"] = tf.keras.metrics.Mean(name="val_i_loss")
-            self.eval_metrics["val_i_acc"] = tf.keras.metrics.Mean(name="val_i_acc")
         else:
             self._loss_label = None
 
@@ -1039,8 +1041,6 @@ def _prepare_layers(self) -> None:
             self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
             self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
             self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
-            self.eval_metrics["val_e_loss"] = tf.keras.metrics.Mean(name="val_e_loss")
-            self.eval_metrics["val_e_f1"] = tf.keras.metrics.Mean(name="val_e_f1")
 
     def set_training_phase(self, training: bool) -> None:
         if training:
@@ -1175,7 +1175,7 @@ def _entity_loss(
 
     def _train_losses_scores(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> Tuple[Dict[Text, float], Dict[Text, float]]:
+    ) -> None:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1185,15 +1185,12 @@ def _train_losses_scores(
             tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
         )
 
-        losses = {}
-        scores = {}
-
         if self.config[MASKED_LM]:
             loss, acc = self._mask_loss(
                 text_transformed, text_in, lm_mask_bool_text, "text"
             )
-            losses["m_loss"] = loss
-            scores["m_acc"] = acc
+            self.train_metrics["m_loss"].update_state(loss)
+            self.train_metrics["m_acc"].update_state(acc)
 
         if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
@@ -1207,8 +1204,8 @@ def _train_losses_scores(
                 tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
             )
             loss, acc = self._intent_loss(cls, label)
-            losses["i_loss"] = loss
-            scores["i_acc"] = acc
+            self.train_metrics["i_loss"].update_state(loss)
+            self.train_metrics["i_acc"].update_state(acc)
 
         if self.config[ENTITY_RECOGNITION]:
             tags = tf_batch_data["tag_ids"][0]
@@ -1216,10 +1213,8 @@ def _train_losses_scores(
             loss, f1 = self._entity_loss(
                 text_transformed, tags, mask_text, sequence_lengths
             )
-            losses["e_loss"] = loss
-            scores["e_f1"] = f1
-
-        return losses, scores
+            self.train_metrics["e_loss"].update_state(loss)
+            self.train_metrics["e_f1"].update_state(f1)
 
     def build_for_predict(self, model_data: RasaModelData) -> None:
         self.batch_tuple_sizes = model_data.batch_tuple_sizes()
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index babc777d7da5..97956e5e09ec 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -14,15 +14,9 @@
 class RasaModel(tf.keras.models.Model):
     """Completely override all public methods of keras Model."""
 
-    @staticmethod
-    def _update_postfix_dict(
-        postfix_dict: Dict[Text, Text], metrics, prefix: Text = ""
-    ) -> Dict[Text, Text]:
-        for name, value in metrics.loss.items():
-            postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
-        for name, value in metrics.score.items():
-            postfix_dict[f"{prefix}{name}"] = f"{value:.3f}"
-        return postfix_dict
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.train_metrics = {}
 
     def fit(
         self,
@@ -33,7 +27,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = False,
+        eager: bool = True,
         random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:
@@ -106,7 +100,7 @@ def evaluation_dataset_function(_batch_size):
 
             # Get the metric results
             postfix_dict = {
-                k: v.result().numpy() for k, v in self.train_metrics.items()
+                k: f"{v.result().numpy():3f}" for k, v in self.train_metrics.items()
             }
 
             if evaluate_on_num_examples > 0:
@@ -116,7 +110,7 @@ def evaluation_dataset_function(_batch_size):
                     or (ep + 1) == epochs
                 ):
                     # Reset the metrics
-                    for metric in self.eval_metrics.values():
+                    for metric in self.train_metrics.values():
                         metric.reset_states()
 
                     # Eval on batches
@@ -126,7 +120,10 @@ def evaluation_dataset_function(_batch_size):
 
                 # Get the metric results
                 postfix_dict.update(
-                    {k: v.result().numpy() for k, v in self.eval_metrics.items()}
+                    {
+                        f"val_{k}": f"{v.result().numpy():3f}"
+                        for k, v in self.train_metrics.items()
+                    }
                 )
 
             pbar.set_postfix(postfix_dict)
@@ -149,33 +146,25 @@ def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         with tf.GradientTape() as tape:
-            losses, scores = self._train_losses_scores(batch_in)
+            self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
-            pred_loss = tf.math.add_n(list(losses.values()))
+            pred_loss = 0  # tf.math.add_n(list(losses.values()))
             total_loss = pred_loss + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
         self.train_metrics["t_loss"].update_state(total_loss)
-        for k, v in losses.items():
-            self.train_metrics[k].update_state(v)
-        for k, v in scores.items():
-            self.train_metrics[k].update_state(v)
 
     def evaluate_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
-        losses, scores = self._train_losses_scores(batch_in)
+        self._train_losses_scores(batch_in)
         regularization_loss = tf.math.add_n(self.losses)
-        pred_loss = tf.math.add_n(list(losses.values()))
+        pred_loss = 0  # tf.math.add_n(list(losses.values()))
         total_loss = pred_loss + regularization_loss
 
-        self.eval_metrics["val_t_loss"].update_state(total_loss)
-        for k, v in losses.items():
-            self.eval_metrics[f"val_{k}"].update_state(v)
-        for k, v in scores.items():
-            self.eval_metrics[f"val_{k}"].update_state(v)
+        self.train_metrics["t_loss"].update_state(total_loss)
 
     def test_on_batch(self, **kwargs) -> None:
         raise NotImplementedError

From fb7a68fc3087321c0fc7dc5948d70aa3beecf8fa Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 15:15:49 +0100
Subject: [PATCH 158/633] make use of self.metrics

---
 .../embedding_intent_classifier.py            | 37 ++++++++-----------
 rasa/utils/tensorflow/tf_models.py            | 25 ++++++++-----
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8cdd1fecbf43..566e490ea303 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -163,7 +163,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 10,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
         INTENT_CLASSIFICATION: True,
@@ -909,17 +909,12 @@ def __init__(
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
-        self.entity_f1 = tfa.metrics.F1Score(
-            num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-            average="micro",
-            name="e_f1",
-        )
         self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
         self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
-        self.total_loss = tf.keras.metrics.Mean(name="t_loss")
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
         self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
+        self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
 
         # persist
         self.all_labels_embed = None
@@ -975,8 +970,6 @@ def _prepare_layers(self) -> None:
 
         self._embed = {}
 
-        self.train_metrics = {"t_loss": tf.keras.metrics.Mean(name="t_loss")}
-
         if self.config[MASKED_LM]:
             self._input_mask = tf_layers.InputMask()
             self._embed["text_mask"] = tf_layers.Embed(
@@ -1000,8 +993,6 @@ def _prepare_layers(self) -> None:
                 self.config[C_EMB],
                 self.config[SCALE_LOSS],
             )
-            self.train_metrics["m_loss"] = tf.keras.metrics.Mean(name="m_loss")
-            self.train_metrics["m_acc"] = tf.keras.metrics.Mean(name="m_acc")
         else:
             self._input_mask = None
             self._loss_mask = None
@@ -1028,8 +1019,6 @@ def _prepare_layers(self) -> None:
                 self.config[C_EMB],
                 self.config[SCALE_LOSS],
             )
-            self.train_metrics["i_loss"] = tf.keras.metrics.Mean(name="i_loss")
-            self.train_metrics["i_acc"] = tf.keras.metrics.Mean(name="i_acc")
         else:
             self._loss_label = None
 
@@ -1039,8 +1028,12 @@ def _prepare_layers(self) -> None:
                 self._num_tags, self.config[C2], "logits"
             )
             self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
-            self.train_metrics["e_loss"] = tf.keras.metrics.Mean(name="e_loss")
-            self.train_metrics["e_f1"] = tf.keras.metrics.Mean(name="e_f1")
+
+            self.entity_f1_score = tfa.metrics.F1Score(
+                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+                average="micro",
+                name="entity_f1_score",
+            )
 
     def set_training_phase(self, training: bool) -> None:
         if training:
@@ -1169,7 +1162,7 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        f1 = self.entity_f1(c_masked_1, pred_ids_masked_1)
+        f1 = self.entity_f1_score(c_masked_1, pred_ids_masked_1)
 
         return loss, f1
 
@@ -1189,8 +1182,8 @@ def _train_losses_scores(
             loss, acc = self._mask_loss(
                 text_transformed, text_in, lm_mask_bool_text, "text"
             )
-            self.train_metrics["m_loss"].update_state(loss)
-            self.train_metrics["m_acc"].update_state(acc)
+            self.mask_loss.update_state(loss)
+            self.mask_acc.update_state(acc)
 
         if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
@@ -1204,8 +1197,8 @@ def _train_losses_scores(
                 tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
             )
             loss, acc = self._intent_loss(cls, label)
-            self.train_metrics["i_loss"].update_state(loss)
-            self.train_metrics["i_acc"].update_state(acc)
+            self.intent_loss.update_state(loss)
+            self.intent_acc.update_state(acc)
 
         if self.config[ENTITY_RECOGNITION]:
             tags = tf_batch_data["tag_ids"][0]
@@ -1213,8 +1206,8 @@ def _train_losses_scores(
             loss, f1 = self._entity_loss(
                 text_transformed, tags, mask_text, sequence_lengths
             )
-            self.train_metrics["e_loss"].update_state(loss)
-            self.train_metrics["e_f1"].update_state(f1)
+            self.entity_loss.update_state(loss)
+            self.entity_f1.update_state(f1)
 
     def build_for_predict(self, model_data: RasaModelData) -> None:
         self.batch_tuple_sizes = model_data.batch_tuple_sizes()
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 97956e5e09ec..c56b36b08ea4 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -16,7 +16,8 @@ class RasaModel(tf.keras.models.Model):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.train_metrics = {}
+
+        self.total_loss = tf.keras.metrics.Mean(name="t_loss")
 
     def fit(
         self,
@@ -90,7 +91,7 @@ def evaluation_dataset_function(_batch_size):
             )
 
             # Reset the metrics
-            for metric in self.train_metrics.values():
+            for metric in self.metrics:
                 metric.reset_states()
 
             # Train on batches
@@ -100,7 +101,7 @@ def evaluation_dataset_function(_batch_size):
 
             # Get the metric results
             postfix_dict = {
-                k: f"{v.result().numpy():3f}" for k, v in self.train_metrics.items()
+                metric.name: f"{metric.result().numpy():.3f}" for metric in self.metrics
             }
 
             if evaluate_on_num_examples > 0:
@@ -110,7 +111,7 @@ def evaluation_dataset_function(_batch_size):
                     or (ep + 1) == epochs
                 ):
                     # Reset the metrics
-                    for metric in self.train_metrics.values():
+                    for metric in self.metrics:
                         metric.reset_states()
 
                     # Eval on batches
@@ -121,8 +122,8 @@ def evaluation_dataset_function(_batch_size):
                 # Get the metric results
                 postfix_dict.update(
                     {
-                        f"val_{k}": f"{v.result().numpy():3f}"
-                        for k, v in self.train_metrics.items()
+                        f"val_{metric.name}": f"{metric.result().numpy():.3f}"
+                        for metric in self.metrics
                     }
                 )
 
@@ -148,23 +149,27 @@ def train_on_batch(
         with tf.GradientTape() as tape:
             self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
-            pred_loss = 0  # tf.math.add_n(list(losses.values()))
+            pred_loss = tf.math.add_n(
+                list([m.result() for m in self.metrics if "loss" in m.name])
+            )
             total_loss = pred_loss + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-        self.train_metrics["t_loss"].update_state(total_loss)
+        self.total_loss.update_state(total_loss)
 
     def evaluate_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         self._train_losses_scores(batch_in)
         regularization_loss = tf.math.add_n(self.losses)
-        pred_loss = 0  # tf.math.add_n(list(losses.values()))
+        pred_loss = tf.math.add_n(
+            list([m.result() for m in self.metrics if "loss" in m.name])
+        )
         total_loss = pred_loss + regularization_loss
 
-        self.train_metrics["t_loss"].update_state(total_loss)
+        self.total_loss.update_state(total_loss)
 
     def test_on_batch(self, **kwargs) -> None:
         raise NotImplementedError

From 5395acecc59be175df366fac2f9ca15ed8f4720a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 15:52:51 +0100
Subject: [PATCH 159/633] refactor RasaModel

---
 .../embedding_intent_classifier.py            | 150 +++++++--------
 rasa/utils/tensorflow/tf_models.py            | 173 +++++++++++-------
 2 files changed, 185 insertions(+), 138 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 566e490ea303..6d96c9414a14 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -848,40 +848,6 @@ def load(
 
 
 class DIET(tf_models.RasaModel):
-    @staticmethod
-    def _create_sparse_dense_layer(
-        data_signature: List[FeatureSignature],
-        name: Text,
-        reg_lambda: float,
-        dense_dim: int,
-    ) -> Optional[tf_layers.DenseForSparse]:
-
-        sparse = False
-        for is_sparse, shape in data_signature:
-            if is_sparse:
-                sparse = is_sparse
-            else:
-                # if dense features are present
-                # use the feature dimension of the dense features
-                dense_dim = shape[-1]
-
-        if sparse:
-            return tf_layers.DenseForSparse(
-                units=dense_dim, reg_lambda=reg_lambda, name=name
-            )
-
-    @staticmethod
-    def _input_dim(data_signature: List[FeatureSignature], dense_dim: int) -> int:
-
-        for is_sparse, shape in data_signature:
-            if not is_sparse:
-                # if dense features are present
-                # use the feature dimension of the dense features
-                dense_dim = shape[-1]
-                break
-
-        return dense_dim * len(data_signature)
-
     def __init__(
         self,
         data_signature: Dict[Text, List[FeatureSignature]],
@@ -916,11 +882,19 @@ def __init__(
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
 
+        if self.config[INTENT_CLASSIFICATION]:
+            self.metrics_to_log += ["i_acc", "i_loss"]
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_acc", "m_loss"]
+        if self.config[ENTITY_RECOGNITION]:
+            self.metrics_to_log += ["e_loss", "e_f1"]
+
         # persist
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
     def _prepare_layers(self) -> None:
+        self._embed = {}
         self._sparse_dropout = tf_layers.SparseDropout(rate=self.config[DROPRATE])
 
         self._sparse_to_dense = {
@@ -968,23 +942,38 @@ def _prepare_layers(self) -> None:
         else:
             self._transformer = lambda x, mask, training: x
 
-        self._embed = {}
+        self._prepare_mask_lm_layers()
+        self._prepare_intent_classification_layers()
+        self._prepare_entity_recognition_layers()
 
-        if self.config[MASKED_LM]:
-            self._input_mask = tf_layers.InputMask()
-            self._embed["text_mask"] = tf_layers.Embed(
+    def _prepare_entity_recognition_layers(self):
+        self._crf = None
+        if self.config[ENTITY_RECOGNITION]:
+            self._embed["logits"] = tf_layers.Embed(
+                self._num_tags, self.config[C2], "logits"
+            )
+            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
+
+            self.metric_f1_score = tfa.metrics.F1Score(
+                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+                average="micro",
+            )
+
+    def _prepare_intent_classification_layers(self):
+        if self.config[INTENT_CLASSIFICATION]:
+            self._embed["text"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text_mask",
+                "text",
                 self.config[SIMILARITY_TYPE],
             )
-            self._embed["text_token"] = tf_layers.Embed(
+            self._embed["label"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text_token",
+                "label",
                 self.config[SIMILARITY_TYPE],
             )
-            self._loss_mask = tf_layers.DotProductLoss(
+            self._loss_label = tf_layers.DotProductLoss(
                 self.config[NUM_NEG],
                 self.config[LOSS_TYPE],
                 self.config[MU_POS],
@@ -994,23 +983,24 @@ def _prepare_layers(self) -> None:
                 self.config[SCALE_LOSS],
             )
         else:
-            self._input_mask = None
-            self._loss_mask = None
+            self._loss_label = None
 
-        if self.config[INTENT_CLASSIFICATION]:
-            self._embed["text"] = tf_layers.Embed(
+    def _prepare_mask_lm_layers(self):
+        if self.config[MASKED_LM]:
+            self._input_mask = tf_layers.InputMask()
+            self._embed["text_mask"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text",
+                "text_mask",
                 self.config[SIMILARITY_TYPE],
             )
-            self._embed["label"] = tf_layers.Embed(
+            self._embed["text_token"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "label",
+                "text_token",
                 self.config[SIMILARITY_TYPE],
             )
-            self._loss_label = tf_layers.DotProductLoss(
+            self._loss_mask = tf_layers.DotProductLoss(
                 self.config[NUM_NEG],
                 self.config[LOSS_TYPE],
                 self.config[MU_POS],
@@ -1020,20 +1010,8 @@ def _prepare_layers(self) -> None:
                 self.config[SCALE_LOSS],
             )
         else:
-            self._loss_label = None
-
-        self._crf = None
-        if self.config[ENTITY_RECOGNITION]:
-            self._embed["logits"] = tf_layers.Embed(
-                self._num_tags, self.config[C2], "logits"
-            )
-            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
-
-            self.entity_f1_score = tfa.metrics.F1Score(
-                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-                average="micro",
-                name="entity_f1_score",
-            )
+            self._input_mask = None
+            self._loss_mask = None
 
     def set_training_phase(self, training: bool) -> None:
         if training:
@@ -1141,15 +1119,7 @@ def _entity_loss(
         c = tf.cast(c[:, :, 0], tf.int32)
         logits = self._embed["logits"](a)
 
-        # tensor shapes
-        # a: tensor(batch-size, max-seq-len, dim)
-        # sequence_lengths: tensor(batch-size)
-        # c: (batch-size, max-seq-len)
-
-        # CRF Loss
         loss = self._crf.loss(logits, c, sequence_lengths)
-
-        # CRF preds
         pred_ids = self._crf(logits, sequence_lengths)
 
         # TODO check that f1 calculation is correct
@@ -1162,7 +1132,7 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        f1 = self.entity_f1_score(c_masked_1, pred_ids_masked_1)
+        f1 = self.metric_f1_score(c_masked_1, pred_ids_masked_1)
 
         return loss, f1
 
@@ -1252,3 +1222,37 @@ def predict(
             out["e_ids"] = pred_ids
 
         return out
+
+    @staticmethod
+    def _create_sparse_dense_layer(
+        data_signature: List[FeatureSignature],
+        name: Text,
+        reg_lambda: float,
+        dense_dim: int,
+    ) -> Optional[tf_layers.DenseForSparse]:
+
+        sparse = False
+        for is_sparse, shape in data_signature:
+            if is_sparse:
+                sparse = is_sparse
+            else:
+                # if dense features are present
+                # use the feature dimension of the dense features
+                dense_dim = shape[-1]
+
+        if sparse:
+            return tf_layers.DenseForSparse(
+                units=dense_dim, reg_lambda=reg_lambda, name=name
+            )
+
+    @staticmethod
+    def _input_dim(data_signature: List[FeatureSignature], dense_dim: int) -> int:
+
+        for is_sparse, shape in data_signature:
+            if not is_sparse:
+                # if dense features are present
+                # use the feature dimension of the dense features
+                dense_dim = shape[-1]
+                break
+
+        return dense_dim * len(data_signature)
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index c56b36b08ea4..2e7e3fdae569 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -2,7 +2,7 @@
 import numpy as np
 import logging
 from collections import defaultdict
-from typing import List, Text, Dict, Tuple, Union, Optional
+from typing import List, Text, Dict, Tuple, Union, Optional, Callable
 from tqdm import tqdm
 from rasa.utils.common import is_logging_disabled
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
@@ -18,6 +18,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
+        self.metrics_to_log = ["t_loss"]
 
     def fit(
         self,
@@ -34,6 +35,7 @@ def fit(
     ) -> None:
         """Train tf graph"""
 
+        evaluation_model_data = None
         if evaluate_on_num_examples > 0:
             logger.info(
                 f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
@@ -47,28 +49,77 @@ def fit(
         disable = silent or is_logging_disabled()
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
-        tf_batch_size = tf.ones((), tf.int32)
+        (
+            tf_train_dataset_function,
+            tf_train_on_batch_function,
+        ) = self._get_tf_train_functions(eager, model_data, batch_strategy)
 
+        (
+            tf_evaluation_dataset_function,
+            tf_evaluation_on_batch_function,
+        ) = self._get_tf_evaluation_functions(
+            eager, evaluate_on_num_examples, evaluation_model_data, batch_strategy
+        )
+
+        for ep in pbar:
+            ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
+
+            self._reset_metrics()
+
+            # Train on batches
+            self.set_training_phase(True)
+            for batch_in in tf_train_dataset_function(ep_batch_size):
+                tf_train_on_batch_function(batch_in)
+
+            postfix_dict = self._get_metric_results()
+
+            if evaluate_on_num_examples > 0:
+                if self._should_evaluate(evaluate_every_num_epochs, epochs, ep):
+                    self._reset_metrics()
+
+                    # Eval on batches
+                    self.set_training_phase(False)
+                    for batch_in in tf_evaluation_dataset_function(ep_batch_size):
+                        tf_evaluation_on_batch_function(batch_in)
+
+                # Get the metric results
+                postfix_dict.update(self._get_metric_results(prefix="val_"))
+
+            pbar.set_postfix(postfix_dict)
+
+        if not disable:
+            logger.info("Finished training.")
+
+    def _get_tf_train_functions(
+        self, eager: bool, model_data: RasaModelData, batch_strategy: Text
+    ) -> Tuple[Callable, Callable]:
         def train_dataset_function(_batch_size):
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
-        def evaluation_dataset_function(_batch_size):
-            return evaluation_model_data.as_tf_dataset(
-                _batch_size, batch_strategy, shuffle=False
-            )
-
         if eager:
-            # allows increasing batch size
             tf_train_dataset_function = train_dataset_function
             tf_train_on_batch_function = self.train_on_batch
         else:
-            # allows increasing batch size
             tf_train_dataset_function = tf.function(func=train_dataset_function)
             tf_train_on_batch_function = tf.function(
                 self.train_on_batch,
                 input_signature=[tf_train_dataset_function(1).element_spec],
             )
 
+        return tf_train_dataset_function, tf_train_on_batch_function
+
+    def _get_tf_evaluation_functions(
+        self,
+        eager: bool,
+        evaluate_on_num_examples: int,
+        evaluation_model_data: RasaModelData,
+        batch_strategy: Text,
+    ) -> Tuple[Callable, Callable]:
+        def evaluation_dataset_function(_batch_size):
+            return evaluation_model_data.as_tf_dataset(
+                _batch_size, batch_strategy, shuffle=False
+            )
+
         if evaluate_on_num_examples > 0:
             if eager:
                 tf_evaluation_dataset_function = evaluation_dataset_function
@@ -85,63 +136,22 @@ def evaluation_dataset_function(_batch_size):
             tf_evaluation_dataset_function = None
             tf_evaluation_on_batch_function = None
 
-        for ep in pbar:
-            ep_batch_size = tf_batch_size * self.linearly_increasing_batch_size(
-                ep, batch_size, epochs
-            )
+        return tf_evaluation_dataset_function, tf_evaluation_on_batch_function
 
-            # Reset the metrics
-            for metric in self.metrics:
-                metric.reset_states()
+    def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]:
+        prefix = prefix or ""
 
-            # Train on batches
-            self.set_training_phase(True)
-            for batch_in in tf_train_dataset_function(ep_batch_size):
-                tf_train_on_batch_function(batch_in)
-
-            # Get the metric results
-            postfix_dict = {
-                metric.name: f"{metric.result().numpy():.3f}" for metric in self.metrics
-            }
-
-            if evaluate_on_num_examples > 0:
-                if (
-                    ep == 0
-                    or (ep + 1) % evaluate_every_num_epochs == 0
-                    or (ep + 1) == epochs
-                ):
-                    # Reset the metrics
-                    for metric in self.metrics:
-                        metric.reset_states()
-
-                    # Eval on batches
-                    self.set_training_phase(False)
-                    for batch_in in tf_evaluation_dataset_function(ep_batch_size):
-                        tf_evaluation_on_batch_function(batch_in)
-
-                # Get the metric results
-                postfix_dict.update(
-                    {
-                        f"val_{metric.name}": f"{metric.result().numpy():.3f}"
-                        for metric in self.metrics
-                    }
-                )
-
-            pbar.set_postfix(postfix_dict)
-
-        if not disable:
-            logger.info("Finished training.")
+        # Get the metric results
+        return {
+            metric.name: f"{prefix}{metric.result().numpy():.3f}"
+            for metric in self.metrics
+            if metric.name in self.metrics_to_log
+        }
 
-    def compile(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def evaluate(self, **kwargs) -> None:
-        pass
-
-    def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
-    ) -> Dict[Text, tf.Tensor]:
-        pass
+    def _reset_metrics(self) -> None:
+        # Reset the metrics
+        for metric in self.metrics:
+            metric.reset_states()
 
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
@@ -150,7 +160,13 @@ def train_on_batch(
             self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
             pred_loss = tf.math.add_n(
-                list([m.result() for m in self.metrics if "loss" in m.name])
+                list(
+                    [
+                        m.result()
+                        for m in self.metrics
+                        if "loss" in m.name.lower() and m.name in self.metrics_to_log
+                    ]
+                )
             )
             total_loss = pred_loss + regularization_loss
 
@@ -165,12 +181,29 @@ def evaluate_on_batch(
         self._train_losses_scores(batch_in)
         regularization_loss = tf.math.add_n(self.losses)
         pred_loss = tf.math.add_n(
-            list([m.result() for m in self.metrics if "loss" in m.name])
+            list(
+                [
+                    m.result()
+                    for m in self.metrics
+                    if "loss" in m.name.lower() and m.name in self.metrics_to_log
+                ]
+            )
         )
         total_loss = pred_loss + regularization_loss
 
         self.total_loss.update_state(total_loss)
 
+    def compile(self, **kwargs) -> None:
+        raise NotImplementedError
+
+    def evaluate(self, **kwargs) -> None:
+        pass
+
+    def predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+    ) -> Dict[Text, tf.Tensor]:
+        pass
+
     def test_on_batch(self, **kwargs) -> None:
         raise NotImplementedError
 
@@ -186,6 +219,16 @@ def evaluate_generator(self, **kwargs) -> None:
     def predict_generator(self, **kwargs) -> None:
         raise NotImplementedError
 
+    @staticmethod
+    def _should_evaluate(
+        evaluate_every_num_epochs: int, epochs: int, current_epoch: int
+    ) -> bool:
+        return (
+            current_epoch == 0
+            or (current_epoch + 1) % evaluate_every_num_epochs == 0
+            or (current_epoch + 1) == epochs
+        )
+
     @staticmethod
     def batch_to_model_data_format(
         batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],

From 75d229b872078abe17dfa1c956baf4227f3e1291 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 15:56:07 +0100
Subject: [PATCH 160/633] change order of methods

---
 .../embedding_intent_classifier.py            | 62 +++++++++----------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6d96c9414a14..682faab6d50f 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -946,34 +946,22 @@ def _prepare_layers(self) -> None:
         self._prepare_intent_classification_layers()
         self._prepare_entity_recognition_layers()
 
-    def _prepare_entity_recognition_layers(self):
-        self._crf = None
-        if self.config[ENTITY_RECOGNITION]:
-            self._embed["logits"] = tf_layers.Embed(
-                self._num_tags, self.config[C2], "logits"
-            )
-            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
-
-            self.metric_f1_score = tfa.metrics.F1Score(
-                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-                average="micro",
-            )
-
-    def _prepare_intent_classification_layers(self):
-        if self.config[INTENT_CLASSIFICATION]:
-            self._embed["text"] = tf_layers.Embed(
+    def _prepare_mask_lm_layers(self):
+        if self.config[MASKED_LM]:
+            self._input_mask = tf_layers.InputMask()
+            self._embed["text_mask"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text",
+                "text_mask",
                 self.config[SIMILARITY_TYPE],
             )
-            self._embed["label"] = tf_layers.Embed(
+            self._embed["text_token"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "label",
+                "text_token",
                 self.config[SIMILARITY_TYPE],
             )
-            self._loss_label = tf_layers.DotProductLoss(
+            self._loss_mask = tf_layers.DotProductLoss(
                 self.config[NUM_NEG],
                 self.config[LOSS_TYPE],
                 self.config[MU_POS],
@@ -983,24 +971,24 @@ def _prepare_intent_classification_layers(self):
                 self.config[SCALE_LOSS],
             )
         else:
-            self._loss_label = None
+            self._input_mask = None
+            self._loss_mask = None
 
-    def _prepare_mask_lm_layers(self):
-        if self.config[MASKED_LM]:
-            self._input_mask = tf_layers.InputMask()
-            self._embed["text_mask"] = tf_layers.Embed(
+    def _prepare_intent_classification_layers(self):
+        if self.config[INTENT_CLASSIFICATION]:
+            self._embed["text"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text_mask",
+                "text",
                 self.config[SIMILARITY_TYPE],
             )
-            self._embed["text_token"] = tf_layers.Embed(
+            self._embed["label"] = tf_layers.Embed(
                 self.config[EMBED_DIM],
                 self.config[C2],
-                "text_token",
+                "label",
                 self.config[SIMILARITY_TYPE],
             )
-            self._loss_mask = tf_layers.DotProductLoss(
+            self._loss_label = tf_layers.DotProductLoss(
                 self.config[NUM_NEG],
                 self.config[LOSS_TYPE],
                 self.config[MU_POS],
@@ -1010,8 +998,20 @@ def _prepare_mask_lm_layers(self):
                 self.config[SCALE_LOSS],
             )
         else:
-            self._input_mask = None
-            self._loss_mask = None
+            self._loss_label = None
+
+    def _prepare_entity_recognition_layers(self):
+        self._crf = None
+        if self.config[ENTITY_RECOGNITION]:
+            self._embed["logits"] = tf_layers.Embed(
+                self._num_tags, self.config[C2], "logits"
+            )
+            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
+
+            self.metric_f1_score = tfa.metrics.F1Score(
+                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+                average="micro",
+            )
 
     def set_training_phase(self, training: bool) -> None:
         if training:

From 9e4569d68683fb15638e54a6fec544369489aa92 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 16:04:23 +0100
Subject: [PATCH 161/633] update logging of metrics

---
 rasa/utils/tensorflow/tf_models.py | 33 ++++++++++++------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 2e7e3fdae569..7517e57676aa 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -29,7 +29,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = True,
+        eager: bool = False,
         random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:
@@ -143,7 +143,7 @@ def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]
 
         # Get the metric results
         return {
-            metric.name: f"{prefix}{metric.result().numpy():.3f}"
+            f"{prefix}{metric.name}": f"{metric.result().numpy():.3f}"
             for metric in self.metrics
             if metric.name in self.metrics_to_log
         }
@@ -153,21 +153,22 @@ def _reset_metrics(self) -> None:
         for metric in self.metrics:
             metric.reset_states()
 
+    def _get_losses_from_metrics(self) -> List[tf.Tensor]:
+        return list(
+            [
+                m.result()
+                for m in self.metrics
+                if "loss" in m.name.lower() and m.name in self.metrics_to_log
+            ]
+        )
+
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         with tf.GradientTape() as tape:
             self._train_losses_scores(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
-            pred_loss = tf.math.add_n(
-                list(
-                    [
-                        m.result()
-                        for m in self.metrics
-                        if "loss" in m.name.lower() and m.name in self.metrics_to_log
-                    ]
-                )
-            )
+            pred_loss = tf.math.add_n(self._get_losses_from_metrics())
             total_loss = pred_loss + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
@@ -180,15 +181,7 @@ def evaluate_on_batch(
     ) -> None:
         self._train_losses_scores(batch_in)
         regularization_loss = tf.math.add_n(self.losses)
-        pred_loss = tf.math.add_n(
-            list(
-                [
-                    m.result()
-                    for m in self.metrics
-                    if "loss" in m.name.lower() and m.name in self.metrics_to_log
-                ]
-            )
-        )
+        pred_loss = tf.math.add_n(self._get_losses_from_metrics())
         total_loss = pred_loss + regularization_loss
 
         self.total_loss.update_state(total_loss)

From bf2fe8b3efbc0bed4c4451157d337f60f0f753c4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 22 Jan 2020 16:05:46 +0100
Subject: [PATCH 162/633] fix overriding data in rasaModelData

---
 rasa/utils/tensorflow/tf_model_data.py | 131 ++++++++++++-------------
 rasa/utils/tensorflow/tf_models.py     |   8 +-
 tests/utils/test_tf_model_data.py      |  15 +--
 3 files changed, 79 insertions(+), 75 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index 5607b5d81906..1081e4cbe2aa 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -7,6 +7,8 @@
 from collections import defaultdict
 
 
+Data = Optional[Dict[Text, List[np.ndarray]]]
+
 class FeatureSignature(NamedTuple):
     is_sparse: bool
     shape: List[int]
@@ -16,10 +18,12 @@ class RasaModelData:
     def __init__(
         self,
         label_key: Optional[Text] = None,
-        data: Optional[Dict[Text, List[np.ndarray]]] = None,
+        data: Data = None,
     ):
         self.data = data or {}
         self.label_key = label_key or ""
+        # will be updated when features are added
+        self.num_examples = self.get_number_of_examples()
 
     def get(self, key: Text) -> List[np.ndarray]:
         return self.data[key]
@@ -36,6 +40,36 @@ def keys(self):
     def feature_not_exists(self, key: Text) -> bool:
         return key not in self.data or not self.data[key]
 
+    def get_number_of_examples(self) -> int:
+        """Obtain number of examples in session data.
+
+        Raise a ValueError if number of examples differ for different data in
+        session data.
+        """
+        if not self.data:
+            return 0
+
+        example_lengths = [v.shape[0] for values in self.data.values() for v in values]
+
+        # check if number of examples is the same for all values
+        if not all(length == example_lengths[0] for length in example_lengths):
+            raise ValueError(
+                f"Number of examples differs for keys '{self.data.keys()}'. Number of "
+                f"examples should be the same for all data."
+            )
+
+        return example_lengths[0]
+
+    def get_feature_dimension(self, key: Text) -> int:
+        """Get the feature dimension of the given key."""
+
+        number_of_features = 0
+        for data in self.data[key]:
+            if data.size > 0:
+                number_of_features += data[0].shape[-1]
+
+        return number_of_features
+
     def split(
         self, number_of_test_examples: int, random_seed: int
     ) -> Tuple["RasaModelData", "RasaModelData"]:
@@ -79,6 +113,9 @@ def add_features(self, key: Text, features: List[np.ndarray]):
         if not self.data[key]:
             del self.data[key]
 
+        # update number of examples
+        self.num_examples = self.get_number_of_examples()
+
     def add_mask(self, key: Text, from_key: Text):
         """Calculate mask for given key and put it under specified key."""
 
@@ -112,14 +149,13 @@ def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
             for key, values in self.data.items()
         }
 
-    def shuffle(self) -> None:
+    def shuffled_data(self, data: Data) -> Data:
         """Shuffle session data."""
 
-        data_points = self.get_number_of_examples()
-        ids = np.random.permutation(data_points)
-        self.data = self._data_for_ids(ids)
+        ids = np.random.permutation(self.num_examples)
+        return self._data_for_ids(data, ids)
 
-    def balance(self, batch_size: int, shuffle: bool) -> None:
+    def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
         """Mix session data to account for class imbalance.
 
         This batching strategy puts rare classes approximately in every other batch,
@@ -127,10 +163,10 @@ def balance(self, batch_size: int, shuffle: bool) -> None:
         that more populated classes should appear more often.
         """
 
-        if self.label_key not in self.data or len(self.data[self.label_key]) > 1:
+        if self.label_key not in data or len(data[self.label_key]) > 1:
             raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
 
-        label_ids = self._create_label_ids(self.data[self.label_key][0])
+        label_ids = self._create_label_ids(data[self.label_key][0])
 
         unique_label_ids, counts_label_ids = np.unique(
             label_ids, return_counts=True, axis=0
@@ -138,14 +174,13 @@ def balance(self, batch_size: int, shuffle: bool) -> None:
         num_label_ids = len(unique_label_ids)
 
         # need to call every time, so that the data is shuffled inside each class
-        label_data = self._split_by_label_ids(label_ids, unique_label_ids)
+        label_data = self._split_by_label_ids(data, label_ids, unique_label_ids)
 
         data_idx = [0] * num_label_ids
         num_data_cycles = [0] * num_label_ids
         skipped = [False] * num_label_ids
 
         new_data = defaultdict(list)
-        num_examples = self.get_number_of_examples()
 
         while min(num_data_cycles) == 0:
             if shuffle:
@@ -161,7 +196,7 @@ def balance(self, batch_size: int, shuffle: bool) -> None:
                     skipped[index] = False
 
                 index_batch_size = (
-                    int(counts_label_ids[index] / num_examples * batch_size) + 1
+                    int(counts_label_ids[index] / self.num_examples * batch_size) + 1
                 )
 
                 for k, values in label_data[index].items():
@@ -185,61 +220,23 @@ def balance(self, batch_size: int, shuffle: bool) -> None:
             for v in values:
                 final_data[k].append(np.concatenate(np.array(v)))
 
-        self.data = final_data
-
-    def get_number_of_examples(self) -> int:
-        """Obtain number of examples in session data.
-
-        Raise a ValueError if number of examples differ for different data in
-        session data.
-        """
-
-        example_lengths = [v.shape[0] for values in self.data.values() for v in values]
-
-        # check if number of examples is the same for all values
-        if not all(length == example_lengths[0] for length in example_lengths):
-            raise ValueError(
-                f"Number of examples differs for keys '{self.data.keys()}'. Number of "
-                f"examples should be the same for all data."
-            )
-
-        return example_lengths[0]
-
-    def get_feature_dimension(self, key: Text) -> int:
-        """Get the feature dimension of the given key."""
-
-        number_of_features = 0
-        for data in self.data[key]:
-            if data.size > 0:
-                number_of_features += data[0].shape[-1]
-
-        return number_of_features
-
-    def convert_to_tf_dataset(
-        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
-    ):
-        """Create tf dataset."""
-
-        shapes, types = self._get_shapes_types()
-
-        return tf.data.Dataset.from_generator(
-            lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle),
-            output_types=types,
-            output_shapes=shapes,
-            args=([batch_size]),
-        )
+        return final_data
 
     def prepare_batch(
         self,
+        data: Optional[Data] = None,
         start: Optional[int] = None,
         end: Optional[int] = None,
         tuple_sizes: Optional[Dict[Text, int]] = None,
     ) -> Tuple[Optional[np.ndarray]]:
         """Slices session data into batch using given start and end value."""
 
+        if not data:
+            data = self.data
+
         batch_data = []
 
-        for key, values in self.data.items():
+        for key, values in data.items():
             # add None for not present values during processing
             if not values:
                 if tuple_sizes:
@@ -341,29 +338,28 @@ def _gen_batch(
     ) -> Generator[Tuple, None, None]:
         """Generate batches."""
 
+        data = self.data
+
         if shuffle:
-            self.shuffle()
+            data = self.shuffled_data(data)
 
         if batch_strategy == "balanced":
-            self.balance(batch_size, shuffle)
+            data = self.balanced_data(data, batch_size, shuffle)
 
-        num_examples = self.get_number_of_examples()
-        num_batches = num_examples // batch_size + int(num_examples % batch_size > 0)
+        num_batches = self.num_examples // batch_size + int(self.num_examples % batch_size > 0)
 
         for batch_num in range(num_batches):
             start = batch_num * batch_size
             end = start + batch_size
 
-            yield self.prepare_batch(start, end)
+            yield self.prepare_batch(data, start, end)
 
     def _check_train_test_sizes(
         self, number_of_test_examples: int, label_counts: Dict[Any, int]
     ):
         """Check whether the test data set is too large or too small."""
 
-        number_of_total_examples = self.get_number_of_examples()
-
-        if number_of_test_examples >= number_of_total_examples - len(label_counts):
+        if number_of_test_examples >= self.num_examples - len(label_counts):
             raise ValueError(
                 f"Test set of {number_of_test_examples} is too large. Remaining "
                 f"train set should be at least equal to number of classes "
@@ -375,24 +371,25 @@ def _check_train_test_sizes(
                 f"be at least equal to number of classes {label_counts}."
             )
 
-    def _data_for_ids(self, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
+    @staticmethod
+    def _data_for_ids(data: Data, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
         """Filter session data by ids."""
 
         new_data = defaultdict(list)
-        for k, values in self.data.items():
+        for k, values in data.items():
             for v in values:
                 new_data[k].append(v[ids])
         return new_data
 
     def _split_by_label_ids(
-        self, label_ids: "np.ndarray", unique_label_ids: "np.ndarray"
+        self, data: Data, label_ids: "np.ndarray", unique_label_ids: "np.ndarray"
     ) -> List["RasaModelData"]:
         """Reorganize session data into a list of session data with the same labels."""
 
         label_data = []
         for label_id in unique_label_ids:
             ids = label_ids == label_id
-            label_data.append(RasaModelData(self.label_key, self._data_for_ids(ids)))
+            label_data.append(RasaModelData(self.label_key, self._data_for_ids(data, ids)))
         return label_data
 
     def _check_label_key(self, label_key: Text):
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 2aaca0637f30..d14e05d8f68b 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -58,7 +58,7 @@ def train_dataset_function(_batch_size):
 
         def evaluation_dataset_function(_batch_size):
             return evaluation_model_data.as_tf_dataset(
-                _batch_size, batch_strategy, shuffle=False
+                _batch_size, "sequence", shuffle=False
             )
 
         if eager:
@@ -66,12 +66,16 @@ def evaluation_dataset_function(_batch_size):
             tf_train_dataset_function = train_dataset_function
             tf_train_on_batch_function = self.train_on_batch
         else:
+            logger.debug("Building tensorflow train graph...")
             # allows increasing batch size
             tf_train_dataset_function = tf.function(func=train_dataset_function)
+            init_dataset = tf_train_dataset_function(tf_batch_size)
             tf_train_on_batch_function = tf.function(
                 self.train_on_batch,
-                input_signature=[tf_train_dataset_function(tf_batch_size).element_spec],
+                input_signature=[init_dataset.element_spec],
             )
+            tf_train_on_batch_function(next(iter(init_dataset)))
+            logger.debug("Finished building tensorflow train graph")
 
         if evaluate_on_num_examples > 0:
             if eager:
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index 12a03997e738..b035cc6eef50 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -60,14 +60,17 @@ async def model_data() -> RasaModelData:
 def test_shuffle_session_data(model_data: RasaModelData):
     before = model_data.values()
 
-    model_data.shuffle()
+    data = model_data.shuffled_data(model_data.data)
 
-    assert np.array(before) != np.array(model_data.values())
+    # check that original data didn't change
+    assert np.array(before) == np.array(model_data.values())
+    # check that new data is different
+    assert np.array(model_data.values()) != np.array(data.values())
 
 
 def test_split_session_data_by_label(model_data: RasaModelData):
     split_model_data = model_data._split_by_label_ids(
-        model_data.get("intent_ids")[0], np.array([0, 1])
+        model_data.data, model_data.get("intent_ids")[0], np.array([0, 1])
     )
 
     assert len(split_model_data) == 2
@@ -100,7 +103,7 @@ def test_train_val_split_incorrect_size(model_data: RasaModelData, size: int):
 
 
 def test_session_data_for_ids(model_data: RasaModelData):
-    filtered_data = model_data._data_for_ids(np.array([0, 1]))
+    filtered_data = model_data._data_for_ids(model_data.data, np.array([0, 1]))
 
     for values in filtered_data.values():
         for v in values:
@@ -142,6 +145,6 @@ def test_gen_batch(model_data: RasaModelData):
 
 
 def test_balance_session_data(model_data: RasaModelData):
-    model_data.balance(2, False)
+    data = model_data.balanced_data(model_data.data, 2, False)
 
-    assert np.all(model_data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))
+    assert np.all(data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))

From de9a688a1151724495992ba6cd8cf2cd1feead18 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 16:08:36 +0100
Subject: [PATCH 163/633] clean up init methods.

---
 .../nlu/classifiers/embedding_intent_classifier.py |  2 +-
 rasa/utils/tensorflow/tf_layers.py                 | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 682faab6d50f..3bf515124e46 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -855,7 +855,7 @@ def __init__(
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
     ) -> None:
-        super(DIET, self).__init__()
+        super().__init__()
 
         # data
         self.data_signature = data_signature
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 5fd8f82dd68b..2841dcb70562 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -29,9 +29,7 @@ class DenseForSparse(tf.keras.layers.Dense):
     def __init__(self, reg_lambda: float, **kwargs) -> None:
         l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
 
-        super(DenseForSparse, self).__init__(
-            kernel_regularizer=l1_regularizer, **kwargs
-        )
+        super().__init__(kernel_regularizer=l1_regularizer, **kwargs)
 
     def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         if not isinstance(inputs, tf.SparseTensor):
@@ -65,7 +63,7 @@ def __init__(
         reg_lambda: float,
         layer_name_suffix: Text,
     ) -> None:
-        super(ReluFfn, self).__init__(name=f"ffnn_{layer_name_suffix}")
+        super().__init__(name=f"ffnn_{layer_name_suffix}")
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._ffn_layers = []
@@ -97,7 +95,7 @@ def __init__(
         layer_name_suffix: Text,
         similarity_type: Optional[Text] = None,
     ) -> None:
-        super(Embed, self).__init__(name=f"embed_{layer_name_suffix}")
+        super().__init__(name=f"embed_{layer_name_suffix}")
 
         self.similarity_type = similarity_type
         if self.similarity_type and self.similarity_type not in {"cosine", "inner"}:
@@ -167,7 +165,7 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
         return output, attention_weights
 
     def __init__(self, d_model, num_heads: int, reg_lambda: float) -> None:
-        super(MultiHeadAttention, self).__init__()
+        super().__init__()
         self.num_heads = num_heads
         self.d_model = d_model
 
@@ -250,7 +248,7 @@ def __init__(
         reg_lambda: float,
         rate: float = 0.1,
     ) -> None:
-        super(TransformerEncoderLayer, self).__init__()
+        super().__init__()
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self._mha = MultiHeadAttention(d_model, num_heads, reg_lambda)
@@ -325,7 +323,7 @@ def __init__(
         unidirectional: bool = False,
         name: Optional[Text] = None,
     ) -> None:
-        super(TransformerEncoder, self).__init__(name=name)
+        super().__init__(name=name)
 
         self.d_model = d_model
         self.unidirectional = unidirectional

From 6cb0c54d51d086469cdce0ae61e6d3dd3058c41c Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Wed, 22 Jan 2020 16:24:31 +0100
Subject: [PATCH 164/633] Update rasa/utils/tensorflow/tf_layers.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/utils/tensorflow/tf_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 8ee4523606e5..288c180bc8f4 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -371,7 +371,7 @@ def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Ten
 
 
 class InputMask(tf.keras.layers.Layer):
-    def build(self, input_shape: "tf.TensorShape") -> None:
+    def build(self, input_shape: tf.TensorShape) -> None:
         initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]),

From 51d2776b870eebb6e004a896977b6f593d1b0cb6 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Wed, 22 Jan 2020 16:24:45 +0100
Subject: [PATCH 165/633] Update rasa/utils/tensorflow/tf_model_data.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/utils/tensorflow/tf_model_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index 1081e4cbe2aa..a08f4daf90d8 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -41,7 +41,7 @@ def feature_not_exists(self, key: Text) -> bool:
         return key not in self.data or not self.data[key]
 
     def get_number_of_examples(self) -> int:
-        """Obtain number of examples in session data.
+        """Obtain number of examples in data.
 
         Raise a ValueError if number of examples differ for different data in
         session data.

From 184ee8d4f6faeba22b6bb2690d8909e1cd73c64b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 17:46:51 +0100
Subject: [PATCH 166/633] review comments

---
 .../embedding_intent_classifier.py            | 187 ++++++++----------
 rasa/utils/tensorflow/tf_models.py            |  30 +--
 2 files changed, 102 insertions(+), 115 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3bf515124e46..6753367981f2 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -855,7 +855,7 @@ def __init__(
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
     ) -> None:
-        super().__init__()
+        super().__init__(name="DIET")
 
         # data
         self.data_signature = data_signature
@@ -868,7 +868,8 @@ def __init__(
         self.config = config
 
         # tf objects
-        self._prepare_layers()
+        self._tf_layers: Dict[Text, Any] = {"embed": {}}
+        self._tf_prepare_layers()
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -882,22 +883,21 @@ def __init__(
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
 
-        if self.config[INTENT_CLASSIFICATION]:
-            self.metrics_to_log += ["i_acc", "i_loss"]
-        if self.config[MASKED_LM]:
-            self.metrics_to_log += ["m_acc", "m_loss"]
-        if self.config[ENTITY_RECOGNITION]:
-            self.metrics_to_log += ["e_loss", "e_f1"]
-
         # persist
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _prepare_layers(self) -> None:
-        self._embed = {}
-        self._sparse_dropout = tf_layers.SparseDropout(rate=self.config[DROPRATE])
+    def _tf_prepare_layers(self) -> None:
+        self._prepare_sequence_layers()
+        self._prepare_mask_lm_layers()
+        self._prepare_intent_classification_layers()
+        self._prepare_entity_recognition_layers()
 
-        self._sparse_to_dense = {
+    def _prepare_sequence_layers(self):
+        self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
+            rate=self.config[DROPRATE]
+        )
+        self._tf_layers["sparse_to_dense"] = {
             "text": self._create_sparse_dense_layer(
                 self.data_signature["text_features"],
                 "text",
@@ -911,8 +911,7 @@ def _prepare_layers(self) -> None:
                 self.config[DENSE_DIM]["label"],
             ),
         }
-
-        self._ffnn = {
+        self._tf_layers["ffnn"] = {
             "text": tf_layers.ReluFfn(
                 self.config[HIDDEN_LAYERS_SIZES_TEXT],
                 self.config[DROPRATE],
@@ -926,9 +925,8 @@ def _prepare_layers(self) -> None:
                 "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
             ),
         }
-
-        if self.config[NUM_TRANSFORMER_LAYERS] > 0:
-            self._transformer = tf_layers.TransformerEncoder(
+        self._tf_layers["transformer"] = (
+            tf_layers.TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
                 self.config[TRANSFORMER_SIZE],
                 self.config[NUM_HEADS],
@@ -939,79 +937,66 @@ def _prepare_layers(self) -> None:
                 self.config[UNIDIRECTIONAL_ENCODER],
                 name="text_encoder",
             )
-        else:
-            self._transformer = lambda x, mask, training: x
-
-        self._prepare_mask_lm_layers()
-        self._prepare_intent_classification_layers()
-        self._prepare_entity_recognition_layers()
+            if self.config[NUM_TRANSFORMER_LAYERS] > 0
+            else lambda x, mask, training: x
+        )
 
     def _prepare_mask_lm_layers(self):
-        if self.config[MASKED_LM]:
-            self._input_mask = tf_layers.InputMask()
-            self._embed["text_mask"] = tf_layers.Embed(
-                self.config[EMBED_DIM],
-                self.config[C2],
-                "text_mask",
-                self.config[SIMILARITY_TYPE],
-            )
-            self._embed["text_token"] = tf_layers.Embed(
-                self.config[EMBED_DIM],
-                self.config[C2],
-                "text_token",
-                self.config[SIMILARITY_TYPE],
-            )
-            self._loss_mask = tf_layers.DotProductLoss(
-                self.config[NUM_NEG],
-                self.config[LOSS_TYPE],
-                self.config[MU_POS],
-                self.config[MU_NEG],
-                self.config[USE_MAX_SIM_NEG],
-                self.config[C_EMB],
-                self.config[SCALE_LOSS],
-            )
-        else:
-            self._input_mask = None
-            self._loss_mask = None
+        self._tf_layers["input_mask"] = tf_layers.InputMask()
+        self._tf_layers["embed"]["text_mask"] = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "text_mask",
+            self.config[SIMILARITY_TYPE],
+        )
+        self._tf_layers["embed"]["text_token"] = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "text_token",
+            self.config[SIMILARITY_TYPE],
+        )
+        self._tf_layers["loss_mask"] = tf_layers.DotProductLoss(
+            self.config[NUM_NEG],
+            self.config[LOSS_TYPE],
+            self.config[MU_POS],
+            self.config[MU_NEG],
+            self.config[USE_MAX_SIM_NEG],
+            self.config[C_EMB],
+            self.config[SCALE_LOSS],
+        )
 
     def _prepare_intent_classification_layers(self):
-        if self.config[INTENT_CLASSIFICATION]:
-            self._embed["text"] = tf_layers.Embed(
-                self.config[EMBED_DIM],
-                self.config[C2],
-                "text",
-                self.config[SIMILARITY_TYPE],
-            )
-            self._embed["label"] = tf_layers.Embed(
-                self.config[EMBED_DIM],
-                self.config[C2],
-                "label",
-                self.config[SIMILARITY_TYPE],
-            )
-            self._loss_label = tf_layers.DotProductLoss(
-                self.config[NUM_NEG],
-                self.config[LOSS_TYPE],
-                self.config[MU_POS],
-                self.config[MU_NEG],
-                self.config[USE_MAX_SIM_NEG],
-                self.config[C_EMB],
-                self.config[SCALE_LOSS],
-            )
-        else:
-            self._loss_label = None
+        self._tf_layers["embed"]["text"] = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "text",
+            self.config[SIMILARITY_TYPE],
+        )
+        self._tf_layers["embed"]["label"] = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "label",
+            self.config[SIMILARITY_TYPE],
+        )
+        self._tf_layers["loss_label"] = tf_layers.DotProductLoss(
+            self.config[NUM_NEG],
+            self.config[LOSS_TYPE],
+            self.config[MU_POS],
+            self.config[MU_NEG],
+            self.config[USE_MAX_SIM_NEG],
+            self.config[C_EMB],
+            self.config[SCALE_LOSS],
+        )
 
     def _prepare_entity_recognition_layers(self):
-        self._crf = None
-        if self.config[ENTITY_RECOGNITION]:
-            self._embed["logits"] = tf_layers.Embed(
-                self._num_tags, self.config[C2], "logits"
-            )
-            self._crf = tf_layers.CRF(self._num_tags, self.config[C2])
-
-            self.metric_f1_score = tfa.metrics.F1Score(
-                num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
-                average="micro",
-            )
+        self._tf_layers["embed"]["logits"] = tf_layers.Embed(
+            self._num_tags, self.config[C2], "logits"
+        )
+        self._tf_layers["crf"] = tf_layers.CRF(self._num_tags, self.config[C2])
+        self._tf_layers["crf_f1_score"] = tfa.metrics.F1Score(
+            num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
+            average="micro",
+        )
 
     def set_training_phase(self, training: bool) -> None:
         if training:
@@ -1032,11 +1017,11 @@ def _combine_sparse_dense_features(
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 if sparse_dropout:
-                    _f = self._sparse_dropout(f, self.training)
+                    _f = self._tf_layers["sparse_dropout"](f, self.training)
                 else:
                     _f = f
 
-                dense_features.append(self._sparse_to_dense[name](_f))
+                dense_features.append(self._tf_layers["sparse_to_dense"][name](_f))
             else:
                 dense_features.append(f)
 
@@ -1050,7 +1035,7 @@ def _create_bow(
     ) -> tf.Tensor:
 
         x = self._combine_sparse_dense_features(features, mask, name)
-        return self._ffnn[name](tf.reduce_sum(x, 1), self.training)
+        return self._tf_layers["ffnn"][name](tf.reduce_sum(x, 1), self.training)
 
     def _create_sequence(
         self,
@@ -1064,11 +1049,11 @@ def _create_sequence(
         )
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._input_mask(x, mask, self.training)
+            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self.training)
         else:
             pre, lm_mask_bool = (x, None)
 
-        transformed = self._transformer(pre, 1 - mask, self.training)
+        transformed = self._tf_layers["transformer"](pre, 1 - mask, self.training)
         transformed = tf.nn.relu(transformed)
 
         return transformed, x, lm_mask_bool
@@ -1085,8 +1070,8 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
         a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
         a_masked = tf.boolean_mask(a, lm_mask_bool)
 
-        a_t_masked_embed = self._embed[f"{name}_mask"](a_t_masked)
-        a_masked_embed = self._embed[f"{name}_token"](a_masked)
+        a_t_masked_embed = self._tf_layers["embed"][f"{name}_mask"](a_t_masked)
+        a_masked_embed = self._tf_layers["embed"][f"{name}_token"](a_masked)
 
         return self._loss_mask(
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
@@ -1098,17 +1083,19 @@ def _build_all_b(self):
             self.tf_label_data["label_mask"][0],
             "label",
         )
-        all_labels_embed = self._embed["label"](all_labels)
+        all_labels_embed = self._tf_layers["embed"]["label"](all_labels)
 
         return all_labels_embed, all_labels
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels_embed, all_labels = self._build_all_b()
 
-        a_embed = self._embed["text"](a)
-        b_embed = self._embed["label"](b)
+        a_embed = self._tf_layers["embed"]["text"](a)
+        b_embed = self._tf_layers["embed"]["label"](b)
 
-        return self._loss_label(a_embed, b_embed, b, all_labels_embed, all_labels)
+        return self._tf_layers["loss_label"](
+            a_embed, b_embed, b, all_labels_embed, all_labels
+        )
 
     def _entity_loss(
         self, a: tf.Tensor, c: tf.Tensor, mask: tf.Tensor, sequence_lengths
@@ -1117,10 +1104,10 @@ def _entity_loss(
         # remove cls token
         sequence_lengths = sequence_lengths - 1
         c = tf.cast(c[:, :, 0], tf.int32)
-        logits = self._embed["logits"](a)
+        logits = self._tf_layers["embed"]["logits"](a)
 
-        loss = self._crf.loss(logits, c, sequence_lengths)
-        pred_ids = self._crf(logits, sequence_lengths)
+        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
+        pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
@@ -1132,7 +1119,7 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        f1 = self.metric_f1_score(c_masked_1, pred_ids_masked_1)
+        f1 = self._tf_layers["crf_f1_score"](c_masked_1, pred_ids_masked_1)
 
         return loss, f1
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 5e80841f44d3..1a836909d508 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -18,7 +18,6 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
-        self.metrics_to_log = ["t_loss"]
 
     def fit(
         self,
@@ -48,22 +47,27 @@ def fit(
 
         disable = silent or is_logging_disabled()
 
+        tf_batch_size = tf.ones((), tf.int32)
         (
             tf_train_dataset_function,
             tf_train_on_batch_function,
-        ) = self._get_tf_train_functions(eager, model_data, batch_strategy)
+        ) = self._get_tf_train_functions(
+            eager, model_data, batch_strategy, tf_batch_size
+        )
 
         (
             tf_evaluation_dataset_function,
             tf_evaluation_on_batch_function,
         ) = self._get_tf_evaluation_functions(
-            eager, evaluate_on_num_examples, evaluation_model_data
+            eager, evaluate_on_num_examples, evaluation_model_data, tf_batch_size
         )
 
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         for ep in pbar:
-            ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
+            ep_batch_size = tf_batch_size * self.linearly_increasing_batch_size(
+                ep, batch_size, epochs
+            )
 
             self._reset_metrics()
 
@@ -92,7 +96,11 @@ def fit(
             logger.info("Finished training.")
 
     def _get_tf_train_functions(
-        self, eager: bool, model_data: RasaModelData, batch_strategy: Text
+        self,
+        eager: bool,
+        model_data: RasaModelData,
+        batch_strategy: Text,
+        tf_batch_size: tf.Tensor,
     ) -> Tuple[Callable, Callable]:
         def train_dataset_function(_batch_size):
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
@@ -104,7 +112,6 @@ def train_dataset_function(_batch_size):
             logger.debug("Building tensorflow train graph...")
             # allows increasing batch size
             tf_train_dataset_function = tf.function(func=train_dataset_function)
-            tf_batch_size = tf.ones((), tf.int32)
             init_dataset = tf_train_dataset_function(tf_batch_size)
             tf_train_on_batch_function = tf.function(
                 self.train_on_batch, input_signature=[init_dataset.element_spec]
@@ -119,6 +126,7 @@ def _get_tf_evaluation_functions(
         eager: bool,
         evaluate_on_num_examples: int,
         evaluation_model_data: RasaModelData,
+        tf_batch_size: tf.Tensor,
     ) -> Tuple[Callable, Callable]:
         def evaluation_dataset_function(_batch_size):
             return evaluation_model_data.as_tf_dataset(
@@ -130,7 +138,6 @@ def evaluation_dataset_function(_batch_size):
                 tf_evaluation_dataset_function = evaluation_dataset_function
                 tf_evaluation_on_batch_function = self.evaluate_on_batch
             else:
-                tf_batch_size = tf.ones((), tf.int32)
                 tf_evaluation_dataset_function = tf.function(
                     func=evaluation_dataset_function
                 )
@@ -153,7 +160,6 @@ def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]
         return {
             f"{prefix}{metric.name}": f"{metric.result().numpy():.3f}"
             for metric in self.metrics
-            if metric.name in self.metrics_to_log
         }
 
     def _reset_metrics(self) -> None:
@@ -162,13 +168,7 @@ def _reset_metrics(self) -> None:
             metric.reset_states()
 
     def _get_losses_from_metrics(self) -> List[tf.Tensor]:
-        return list(
-            [
-                m.result()
-                for m in self.metrics
-                if "loss" in m.name.lower() and m.name in self.metrics_to_log
-            ]
-        )
+        return list([m.result() for m in self.metrics if "loss" in m.name.lower()])
 
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs

From a56f9db4d7f713e9ebd14e83ad809fcb65f1fa31 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 17:55:10 +0100
Subject: [PATCH 167/633] naming

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 6753367981f2..56a839ecc20d 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -868,8 +868,7 @@ def __init__(
         self.config = config
 
         # tf objects
-        self._tf_layers: Dict[Text, Any] = {"embed": {}}
-        self._tf_prepare_layers()
+        self._prepare_layers()
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -887,8 +886,10 @@ def __init__(
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _tf_prepare_layers(self) -> None:
+    def _prepare_layers(self) -> None:
+        self._tf_layers = {}
         self._prepare_sequence_layers()
+        self._tf_layers["embed"] = {}
         self._prepare_mask_lm_layers()
         self._prepare_intent_classification_layers()
         self._prepare_entity_recognition_layers()

From 694637806cdb52afaf91a29d5bf99dee951434f4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 22 Jan 2020 17:59:07 +0100
Subject: [PATCH 168/633] reintroduce metrics to log

---
 rasa/nlu/classifiers/embedding_intent_classifier.py |  9 +++++++++
 rasa/utils/tensorflow/tf_models.py                  | 10 +++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 56a839ecc20d..3b3ce7b77391 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -881,11 +881,20 @@ def __init__(
         self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
+        self._update_metrics_to_log()
 
         # persist
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
+    def _update_metrics_to_log(self):
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_loss", "m_acc"]
+        if self.config[INTENT_CLASSIFICATION]:
+            self.metrics_to_log += ["i_loss", "i_acc"]
+        if self.config[ENTITY_RECOGNITION]:
+            self.metrics_to_log += ["e_loss", "e_f1"]
+
     def _prepare_layers(self) -> None:
         self._tf_layers = {}
         self._prepare_sequence_layers()
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 1a836909d508..d66fcec7fb65 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -18,6 +18,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
+        self.metrics_to_log = ["t_loss"]
 
     def fit(
         self,
@@ -160,6 +161,7 @@ def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]
         return {
             f"{prefix}{metric.name}": f"{metric.result().numpy():.3f}"
             for metric in self.metrics
+            if metric.name in self.metrics_to_log
         }
 
     def _reset_metrics(self) -> None:
@@ -168,7 +170,13 @@ def _reset_metrics(self) -> None:
             metric.reset_states()
 
     def _get_losses_from_metrics(self) -> List[tf.Tensor]:
-        return list([m.result() for m in self.metrics if "loss" in m.name.lower()])
+        return list(
+            [
+                m.result()
+                for m in self.metrics
+                if "loss" in m.name.lower() and m.name in self.metrics_to_log
+            ]
+        )
 
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs

From 39c83e26fc921a50d780666f0a323f40ce5d5976 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 09:25:48 +0100
Subject: [PATCH 169/633] review comments

---
 .../embedding_intent_classifier.py            | 103 +++++++++---------
 1 file changed, 49 insertions(+), 54 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3b3ce7b77391..4a554dfaeadf 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -887,7 +887,7 @@ def __init__(
         self.all_labels_embed = None
         self.batch_tuple_sizes = None
 
-    def _update_metrics_to_log(self):
+    def _update_metrics_to_log(self) -> None:
         if self.config[MASKED_LM]:
             self.metrics_to_log += ["m_loss", "m_acc"]
         if self.config[INTENT_CLASSIFICATION]:
@@ -898,43 +898,38 @@ def _update_metrics_to_log(self):
     def _prepare_layers(self) -> None:
         self._tf_layers = {}
         self._prepare_sequence_layers()
-        self._tf_layers["embed"] = {}
         self._prepare_mask_lm_layers()
         self._prepare_intent_classification_layers()
         self._prepare_entity_recognition_layers()
 
-    def _prepare_sequence_layers(self):
+    def _prepare_sequence_layers(self) -> None:
         self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
-        self._tf_layers["sparse_to_dense"] = {
-            "text": self._create_sparse_dense_layer(
-                self.data_signature["text_features"],
-                "text",
-                self.config[C2],
-                self.config[DENSE_DIM]["text"],
-            ),
-            "label": self._create_sparse_dense_layer(
-                self.data_signature["label_features"],
-                "label",
-                self.config[C2],
-                self.config[DENSE_DIM]["label"],
-            ),
-        }
-        self._tf_layers["ffnn"] = {
-            "text": tf_layers.ReluFfn(
-                self.config[HIDDEN_LAYERS_SIZES_TEXT],
-                self.config[DROPRATE],
-                self.config[C2],
-                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
-            ),
-            "label": tf_layers.ReluFfn(
-                self.config[HIDDEN_LAYERS_SIZES_LABEL],
-                self.config[DROPRATE],
-                self.config[C2],
-                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
-            ),
-        }
+        self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
+            self.data_signature["text_features"],
+            "text",
+            self.config[C2],
+            self.config[DENSE_DIM]["text"],
+        )
+        self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
+            self.data_signature["label_features"],
+            "label",
+            self.config[C2],
+            self.config[DENSE_DIM]["label"],
+        )
+        self._tf_layers["ffnn.text"] = tf_layers.ReluFfn(
+            self.config[HIDDEN_LAYERS_SIZES_TEXT],
+            self.config[DROPRATE],
+            self.config[C2],
+            "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
+        )
+        self._tf_layers["ffnn.label"] = tf_layers.ReluFfn(
+            self.config[HIDDEN_LAYERS_SIZES_LABEL],
+            self.config[DROPRATE],
+            self.config[C2],
+            "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
+        )
         self._tf_layers["transformer"] = (
             tf_layers.TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
@@ -951,15 +946,15 @@ def _prepare_sequence_layers(self):
             else lambda x, mask, training: x
         )
 
-    def _prepare_mask_lm_layers(self):
+    def _prepare_mask_lm_layers(self) -> None:
         self._tf_layers["input_mask"] = tf_layers.InputMask()
-        self._tf_layers["embed"]["text_mask"] = tf_layers.Embed(
+        self._tf_layers["embed.lm_mask"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "text_mask",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed"]["text_token"] = tf_layers.Embed(
+        self._tf_layers["embed.golden_token"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "text_token",
@@ -975,14 +970,14 @@ def _prepare_mask_lm_layers(self):
             self.config[SCALE_LOSS],
         )
 
-    def _prepare_intent_classification_layers(self):
-        self._tf_layers["embed"]["text"] = tf_layers.Embed(
+    def _prepare_intent_classification_layers(self) -> None:
+        self._tf_layers["embed.text"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "text",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed"]["label"] = tf_layers.Embed(
+        self._tf_layers["embed.label"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "label",
@@ -998,8 +993,8 @@ def _prepare_intent_classification_layers(self):
             self.config[SCALE_LOSS],
         )
 
-    def _prepare_entity_recognition_layers(self):
-        self._tf_layers["embed"]["logits"] = tf_layers.Embed(
+    def _prepare_entity_recognition_layers(self) -> None:
+        self._tf_layers["embed.logits"] = tf_layers.Embed(
             self._num_tags, self.config[C2], "logits"
         )
         self._tf_layers["crf"] = tf_layers.CRF(self._num_tags, self.config[C2])
@@ -1031,7 +1026,7 @@ def _combine_sparse_dense_features(
                 else:
                     _f = f
 
-                dense_features.append(self._tf_layers["sparse_to_dense"][name](_f))
+                dense_features.append(self._tf_layers[f"sparse_to_dense.{name}"](_f))
             else:
                 dense_features.append(f)
 
@@ -1045,7 +1040,7 @@ def _create_bow(
     ) -> tf.Tensor:
 
         x = self._combine_sparse_dense_features(features, mask, name)
-        return self._tf_layers["ffnn"][name](tf.reduce_sum(x, 1), self.training)
+        return self._tf_layers[f"ffnn.{name}"](tf.reduce_sum(x, 1), self.training)
 
     def _create_sequence(
         self,
@@ -1053,7 +1048,7 @@ def _create_sequence(
         mask: tf.Tensor,
         name: Text,
         masked_lm_loss: bool = False,
-    ):
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         x = self._combine_sparse_dense_features(
             features, mask, name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT]
         )
@@ -1068,7 +1063,9 @@ def _create_sequence(
 
         return transformed, x, lm_mask_bool
 
-    def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
+    def _mask_loss(
+        self, a_transformed: tf.Tensor, a: tf.Tensor, lm_mask_bool: tf.Tensor
+    ) -> tf.Tensor:
         # make sure there is at least one element in the mask
         lm_mask_bool = tf.cond(
             tf.reduce_any(lm_mask_bool),
@@ -1080,28 +1077,28 @@ def _mask_loss(self, a_transformed, a, lm_mask_bool, name):
         a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
         a_masked = tf.boolean_mask(a, lm_mask_bool)
 
-        a_t_masked_embed = self._tf_layers["embed"][f"{name}_mask"](a_t_masked)
-        a_masked_embed = self._tf_layers["embed"][f"{name}_token"](a_masked)
+        a_t_masked_embed = self._tf_layers["embed.lm_mask"](a_t_masked)
+        a_masked_embed = self._tf_layers["embed.golden_token"](a_masked)
 
-        return self._loss_mask(
+        return self._tf_layers["loss_mask"](
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
 
-    def _build_all_b(self):
+    def _build_all_b(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_labels = self._create_bow(
             self.tf_label_data["label_features"],
             self.tf_label_data["label_mask"][0],
             "label",
         )
-        all_labels_embed = self._tf_layers["embed"]["label"](all_labels)
+        all_labels_embed = self._tf_layers["embed.label"](all_labels)
 
         return all_labels_embed, all_labels
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels_embed, all_labels = self._build_all_b()
 
-        a_embed = self._tf_layers["embed"]["text"](a)
-        b_embed = self._tf_layers["embed"]["label"](b)
+        a_embed = self._tf_layers["embed.text"](a)
+        b_embed = self._tf_layers["embed.label"](b)
 
         return self._tf_layers["loss_label"](
             a_embed, b_embed, b, all_labels_embed, all_labels
@@ -1114,7 +1111,7 @@ def _entity_loss(
         # remove cls token
         sequence_lengths = sequence_lengths - 1
         c = tf.cast(c[:, :, 0], tf.int32)
-        logits = self._tf_layers["embed"]["logits"](a)
+        logits = self._tf_layers["embed.logits"](a)
 
         loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
@@ -1146,9 +1143,7 @@ def _train_losses_scores(
         )
 
         if self.config[MASKED_LM]:
-            loss, acc = self._mask_loss(
-                text_transformed, text_in, lm_mask_bool_text, "text"
-            )
+            loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text)
             self.mask_loss.update_state(loss)
             self.mask_acc.update_state(acc)
 

From 5437ac8ed68578808b460cacf43995ec98639906 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 10:56:00 +0100
Subject: [PATCH 170/633] Create TED model

---
 rasa/core/policies/embedding_policy.py        | 315 +++++++++++-------
 .../embedding_intent_classifier.py            |   8 -
 2 files changed, 203 insertions(+), 120 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 306a17e66118..1f5e0f979e0f 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -6,7 +6,7 @@
 import warnings
 
 import numpy as np
-from typing import Any, List, Optional, Text, Dict, Tuple
+from typing import Any, List, Optional, Text, Dict, Tuple, Union, Callable
 
 import rasa.utils.io
 from rasa.core.domain import Domain
@@ -24,12 +24,41 @@
 import tensorflow as tf
 
 # avoid warning println on contrib import - remove for tf 2
+from utils.tensorflow import tf_models, tf_layers
 from utils.train_utils import TrainingMetrics
 
 tf.contrib._warning = None
 logger = logging.getLogger(__name__)
 
 
+# constants - configuration parameters
+HIDDEN_LAYERS_SIZES_PRE_DIAL = "hidden_layers_sizes_pre_dial"
+HIDDEN_LAYERS_SIZES_BOT = "hidden_layers_sizes_bot"
+TRANSFORMER_SIZE = "transformer_size"
+NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
+NUM_HEADS = "number_of_attention_heads"
+POS_ENCODING = "positional_encoding"
+MAX_SEQ_LENGTH = "maximum_sequence_length"
+BATCH_SIZES = "batch_sizes"
+BATCH_STRATEGY = "batch_strategy"
+EPOCHS = "epochs"
+RANDOM_SEED = "random_seed"
+EMBED_DIM = "embedding_dimension"
+NUM_NEG = "number_of_negative_examples"
+SIMILARITY_TYPE = "similarity_type"
+LOSS_TYPE = "loss_type"
+MU_POS = "maximum_positive_similarity"
+MU_NEG = "maximum_negative_similarity"
+USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
+SCALE_LOSS = "scale_loss"
+C2 = "l2_regularization"
+C_EMB = "c_emb"
+DROPRATE_DIAL = "droprate_dial"
+DROPRATE_BOT = "droprate_bot"
+EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
+EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
+
+
 class EmbeddingPolicy(Policy):
     """Transformer Embedding Dialogue Policy (TEDP)
 
@@ -43,64 +72,64 @@ class EmbeddingPolicy(Policy):
         # nn architecture
         # a list of hidden layers sizes before user embed layer
         # number of hidden layers is equal to the length of this list
-        "hidden_layers_sizes_pre_dial": [],
+        HIDDEN_LAYERS_SIZES_PRE_DIAL: [],
         # a list of hidden layers sizes before bot embed layer
         # number of hidden layers is equal to the length of this list
-        "hidden_layers_sizes_bot": [],
+        HIDDEN_LAYERS_SIZES_BOT: [],
         # number of units in transformer
-        "transformer_size": 128,
+        TRANSFORMER_SIZE: 128,
         # number of transformer layers
-        "num_transformer_layers": 1,
+        NUM_TRANSFORMER_LAYERS: 1,
         # type of positional encoding in transformer
-        "pos_encoding": "timing",  # string 'timing' or 'emb'
+        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
-        "max_seq_length": 256,
+        MAX_SEQ_LENGTH: 256,
         # number of attention heads in transformer
-        "num_heads": 4,
+        NUM_HEADS: 4,
         # training parameters
         # initial and final batch sizes:
         # batch size will be linearly increased for each epoch
-        "batch_size": [8, 32],
+        BATCH_SIZES: [8, 32],
         # how to create batches
-        "batch_strategy": "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
-        "epochs": 1,
+        EPOCHS: 1,
         # set random seed to any int to get reproducible results
-        "random_seed": None,
+        RANDOM_SEED: None,
         # embedding parameters
         # dimension size of embedding vectors
-        "embed_dim": 20,
+        EMBED_DIM: 20,
         # the type of the similarity
-        "num_neg": 20,
+        NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect labels
-        "similarity_type": "auto",  # string 'auto' or 'cosine' or 'inner'
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
         # the type of the loss function
-        "loss_type": "softmax",  # string 'softmax' or 'margin'
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        "mu_pos": 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        "mu_neg": -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MU_NEG: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
         # the number of incorrect labels, the algorithm will minimize
         # their similarity to the user input during training
-        "use_max_sim_neg": True,  # flag which loss function to use
+        USE_MAX_SIM_NEG: True,  # flag which loss function to use
         # scale loss inverse proportionally to confidence of correct prediction
-        "scale_loss": True,
+        SCALE_LOSS: True,
         # regularization
         # the scale of L2 regularization
-        "C2": 0.001,
+        C2: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
-        "C_emb": 0.8,
+        C_EMB: 0.8,
         # dropout rate for dial nn
-        "droprate_a": 0.1,
+        DROPRATE_DIAL: 0.1,
         # dropout rate for bot nn
-        "droprate_b": 0.0,
+        DROPRATE_BOT: 0.0,
         # visualization of accuracy
         # how often calculate validation accuracy
-        "evaluate_every_num_epochs": 20,  # small values may hurt performance
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for hold out validation set
-        "evaluate_on_num_examples": 0,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -117,18 +146,9 @@ def __init__(
         self,
         featurizer: Optional["TrackerFeaturizer"] = None,
         priority: int = DEFAULT_POLICY_PRIORITY,
-        graph: Optional["tf.Graph"] = None,
-        session: Optional["tf.Session"] = None,
-        user_placeholder: Optional["tf.Tensor"] = None,
-        bot_placeholder: Optional["tf.Tensor"] = None,
-        similarity_all: Optional["tf.Tensor"] = None,
-        pred_confidence: Optional["tf.Tensor"] = None,
-        similarity: Optional["tf.Tensor"] = None,
-        dial_embed: Optional["tf.Tensor"] = None,
-        bot_embed: Optional["tf.Tensor"] = None,
-        all_bot_embed: Optional["tf.Tensor"] = None,
-        attention_weights: Optional["tf.Tensor"] = None,
         max_history: Optional[int] = None,
+        model: Optional[tf_models.RasaModel] = None,
+        predict_func: Optional[Callable] = None,
         **kwargs: Any,
     ) -> None:
         """Declare instant variables with default values"""
@@ -139,88 +159,27 @@ def __init__(
 
         self._load_params(**kwargs)
 
+        self.model = model
+        self.predict_func = predict_func
+
         # encode all label_ids with numbers
         self._encoded_all_label_ids = None
 
-        # tf related instances
-        self.graph = graph
-        self.session = session
-        self.a_in = user_placeholder
-        self.b_in = bot_placeholder
-        self.sim_all = similarity_all
-        self.pred_confidence = pred_confidence
-        self.sim = similarity
-
-        # persisted embeddings
-        self.dial_embed = dial_embed
-        self.bot_embed = bot_embed
-        self.all_bot_embed = all_bot_embed
-
-        self.attention_weights = attention_weights
-        # internal tf instances
-        self._iterator = None
-        self._train_op = None
-        self._is_training = None
+        self._tf_config = train_utils.load_tf_config(self.config)
 
     # init helpers
-    def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None:
-        self.hidden_layers_sizes = {
-            "pre_dial": config["hidden_layers_sizes_pre_dial"],
-            "bot": config["hidden_layers_sizes_bot"],
-        }
-
-        self.pos_encoding = config["pos_encoding"]
-        self.max_seq_length = config["max_seq_length"]
-        self.num_heads = config["num_heads"]
-
-        self.transformer_size = config["transformer_size"]
-        self.num_transformer_layers = config["num_transformer_layers"]
-
-        self.batch_size = config["batch_size"]
-        self.batch_strategy = config["batch_strategy"]
-
-        self.epochs = config["epochs"]
-
-        self.random_seed = config["random_seed"]
-
-    def _load_embedding_params(self, config: Dict[Text, Any]) -> None:
-        self.embed_dim = config["embed_dim"]
-        self.num_neg = config["num_neg"]
-
-        self.similarity_type = config["similarity_type"]
-        self.loss_type = config["loss_type"]
-        if self.similarity_type == "auto":
-            if self.loss_type == "softmax":
-                self.similarity_type = "inner"
-            elif self.loss_type == "margin":
-                self.similarity_type = "cosine"
-
-        self.mu_pos = config["mu_pos"]
-        self.mu_neg = config["mu_neg"]
-        self.use_max_sim_neg = config["use_max_sim_neg"]
-
-        self.scale_loss = config["scale_loss"]
-
-    def _load_regularization_params(self, config: Dict[Text, Any]) -> None:
-        self.C2 = config["C2"]
-        self.C_emb = config["C_emb"]
-        self.droprate = {"bot": config["droprate_b"], "dial": config["droprate_a"]}
-
-    def _load_visual_params(self, config: Dict[Text, Any]) -> None:
-        self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"]
-        if self.evaluate_every_num_epochs < 1:
-            self.evaluate_every_num_epochs = self.epochs
-        self.evaluate_on_num_examples = config["evaluate_on_num_examples"]
-
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
-        config = copy.deepcopy(self.defaults)
-        config.update(kwargs)
+        self.config = copy.deepcopy(self.defaults)
+        self.config.update(kwargs)
+
+        if self.config[SIMILARITY_TYPE] == "auto":
+            if self.config[LOSS_TYPE] == "softmax":
+                self.config[SIMILARITY_TYPE] = "inner"
+            elif self.config[LOSS_TYPE] == "margin":
+                self.config[SIMILARITY_TYPE] = "cosine"
 
-        self._tf_config = train_utils.load_tf_config(config)
-        self._load_nn_architecture_params(config)
-        self._load_embedding_params(config)
-        self._load_regularization_params(config)
-        self._load_visual_params(config)
+        if self.config[EVAL_NUM_EPOCHS] < 1:
+            self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
 
     # data helpers
     # noinspection PyPep8Naming
@@ -679,3 +638,135 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
             all_bot_embed=all_bot_embed,
             attention_weights=attention_weights,
         )
+
+
+class TED(tf_models.RasaModel):
+    def __init__(self, config: Dict[Text, Any]):
+        super().__init__()
+
+        self.config = config
+
+        # tf tensors
+        self.training = tf.ones((), tf.bool)
+
+        # persist
+        self.all_bot_embed = None
+
+        self.metric_loss = tf.keras.metrics.Mean(name="loss")
+        self.metric_acc = tf.keras.metrics.Mean(name="acc")
+
+        self._loss_label = tf_layers.DotProductLoss(
+            self.config[NUM_NEG],
+            self.config[LOSS_TYPE],
+            self.config[MU_POS],
+            self.config[MU_NEG],
+            self.config[USE_MAX_SIM_NEG],
+            self.config[C_EMB],
+            self.config[SCALE_LOSS],
+        )
+        self._ffnn_pre_dial = tf_layers.ReluFfn(
+            self.config[HIDDEN_LAYERS_SIZES_PRE_DIAL],
+            self.config[DROPRATE_DIAL],
+            self.config[C2],
+            layer_name_suffix="pre_dial",
+        )
+        self._ffnn_bot = tf_layers.ReluFfn(
+            self.config[HIDDEN_LAYERS_SIZES_BOT],
+            self.config[DROPRATE_BOT],
+            self.config[C2],
+            layer_name_suffix="bot",
+        )
+        self._transformer = tf_layers.TransformerEncoder(
+            self.config[NUM_TRANSFORMER_LAYERS],
+            self.config[TRANSFORMER_SIZE],
+            self.config[NUM_HEADS],
+            self.config[TRANSFORMER_SIZE] * 4,
+            self.config[MAX_SEQ_LENGTH],
+            self.config[C2],
+            self.config[DROPRATE_DIAL],
+            name="dial_encoder",
+        )
+        self._embed_dial = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "dial",
+            self.config[SIMILARITY_TYPE],
+        )
+        self._embed_bot = tf_layers.Embed(
+            self.config[EMBED_DIM], self.config[C2], "bot", self.config[SIMILARITY_TYPE]
+        )
+
+    def set_training_phase(self, training: bool) -> None:
+        if training:
+            self.training = tf.ones((), tf.bool)
+        else:
+            self.training = tf.zeros((), tf.bool)
+
+    def _create_tf_dial(self, a_in: tf.Tensor):
+        """Create dialogue level embedding and mask."""
+
+        # mask different length sequences
+        # if there is at least one `-1` it should be masked
+        mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1)
+
+        a = self._ffnn_pre_dial(a_in, self.training)
+        a = self._transformer(a, mask, self.training)
+
+        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
+            # pick last label if max history featurizer is used
+            a = a[:, -1:, :]
+            mask = mask[:, -1:]
+
+        dial_embed = self._embed_dial(a)
+
+        return dial_embed, mask
+
+    def _create_tf_bot_embed(self, b_in: tf.Tensor):
+        b = self._ffnn_bot(b_in, self.training)
+        return self._embed_bot(b)
+
+    def _train_losses_scores(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> None:
+        a_in, b_in, _ = batch_in
+
+        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
+            # add time dimension if max history featurizer is used
+            b_in = b_in[:, tf.newaxis, :]
+
+        all_bot_raw = tf.constant(
+            self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
+        )
+
+        dial_embed, mask = self._create_tf_dial(self.a_in)
+
+        bot_embed = self._create_tf_bot_embed(self.b_in)
+        self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
+
+        loss, acc = self._loss_label(
+            dial_embed, bot_embed, b_in, self.all_bot_embed, all_bot_raw, mask
+        )
+
+        self.metric_loss.update_state(loss)
+        self.metric_acc.update_state(acc)
+
+    def build_for_predict(self) -> None:
+        all_bot_raw = tf.constant(
+            self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
+        )
+        self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
+
+    def predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+    ) -> tf.Tensor:
+        a_in, b_in, _ = batch_in
+
+        dial_embed, mask = self._create_tf_dial(a_in)
+
+        sim_all = self._loss_label.sim(
+            dial_embed[:, :, tf.newaxis, :],
+            self.all_bot_embed[tf.newaxis, tf.newaxis, :, :],
+            mask,
+        )
+
+        return train_utils.confidence_from_sim(sim_all, self.config[SIMILARITY_TYPE])
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 32466e78b887..0ec6a801c6b8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -208,7 +208,6 @@ def __init__(
         model: Optional[tf_models.RasaModel] = None,
         predict_func: Optional[Callable] = None,
         batch_tuple_sizes: Optional[Dict] = None,
-        attention_weights: Optional[tf.Tensor] = None,
     ) -> None:
         """Declare instance variables with default values"""
 
@@ -229,16 +228,9 @@ def __init__(
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes
 
-        # internal tf instances
-        self._iterator = None
-        self._train_op = None
-        self._is_training = None
-
         # number of entity tags
         self.num_tags = 0
 
-        self.attention_weights = attention_weights
-
         self._tf_config = train_utils.load_tf_config(self.component_config)
 
         self.data_example = None

From 2869066a8cdfe69863e25ab3b81d07bc50a340f0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 10:59:04 +0100
Subject: [PATCH 171/633] review comments

---
 .../classifiers/embedding_intent_classifier.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 4a554dfaeadf..e659c7c49f07 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -163,14 +163,14 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 10,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
         INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
         ENTITY_RECOGNITION: True,
-        MASKED_LM: False,
-        SPARSE_INPUT_DROPOUT: False,
+        MASKED_LM: True,
+        SPARSE_INPUT_DROPOUT: True,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -951,16 +951,16 @@ def _prepare_mask_lm_layers(self) -> None:
         self._tf_layers["embed.lm_mask"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "text_mask",
+            "lm_mask",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.golden_token"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "text_token",
+            "golden_token",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["loss_mask"] = tf_layers.DotProductLoss(
+        self._tf_layers["loss.mask"] = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -983,7 +983,7 @@ def _prepare_intent_classification_layers(self) -> None:
             "label",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["loss_label"] = tf_layers.DotProductLoss(
+        self._tf_layers["loss.label"] = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -1080,7 +1080,7 @@ def _mask_loss(
         a_t_masked_embed = self._tf_layers["embed.lm_mask"](a_t_masked)
         a_masked_embed = self._tf_layers["embed.golden_token"](a_masked)
 
-        return self._tf_layers["loss_mask"](
+        return self._tf_layers["loss.mask"](
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
 
@@ -1100,7 +1100,7 @@ def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
 
-        return self._tf_layers["loss_label"](
+        return self._tf_layers["loss.label"](
             a_embed, b_embed, b, all_labels_embed, all_labels
         )
 

From 51d84715160a61937809d922142fd31327368c04 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 10:59:27 +0100
Subject: [PATCH 172/633] reset defaut parameters

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index e659c7c49f07..50e9365321c6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -163,14 +163,14 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 10,  # large values may hurt performance
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
         INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
         ENTITY_RECOGNITION: True,
-        MASKED_LM: True,
-        SPARSE_INPUT_DROPOUT: True,
+        MASKED_LM: False,
+        SPARSE_INPUT_DROPOUT: False,
     }
     # end default properties (DOC MARKER - don't remove)
 

From b6fe3d92474c10ccdea2dd095cc24f3418aeb43c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 11:49:38 +0100
Subject: [PATCH 173/633] Update EmbeddingPolicy (not yet working)

---
 rasa/core/policies/embedding_policy.py        | 467 ++++++------------
 .../embedding_intent_classifier.py            |  10 +-
 2 files changed, 147 insertions(+), 330 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 1f5e0f979e0f..03685c6db6aa 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1,9 +1,7 @@
 import copy
-import json
 import logging
 import os
 import pickle
-import warnings
 
 import numpy as np
 from typing import Any, List, Optional, Text, Dict, Tuple, Union, Callable
@@ -24,10 +22,9 @@
 import tensorflow as tf
 
 # avoid warning println on contrib import - remove for tf 2
-from utils.tensorflow import tf_models, tf_layers
-from utils.train_utils import TrainingMetrics
+from rasa.utils.tensorflow import tf_models, tf_layers
+from rasa.utils.tensorflow.tf_model_data import RasaModelData
 
-tf.contrib._warning = None
 logger = logging.getLogger(__name__)
 
 
@@ -165,6 +162,8 @@ def __init__(
         # encode all label_ids with numbers
         self._encoded_all_label_ids = None
 
+        self.data_example = None
+
         self._tf_config = train_utils.load_tf_config(self.config)
 
     # init helpers
@@ -184,13 +183,13 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
     # data helpers
     # noinspection PyPep8Naming
     @staticmethod
-    def _label_ids_for_Y(data_Y: "np.ndarray") -> "np.ndarray":
+    def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray:
         """Prepare Y data for training: extract label_ids."""
 
         return data_Y.argmax(axis=-1)
 
     # noinspection PyPep8Naming
-    def _label_features_for_Y(self, label_ids: "np.ndarray") -> "np.ndarray":
+    def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
         """Prepare Y data for training: features for label_ids."""
 
         if len(label_ids.shape) == 2:  # full dialogue featurizer is used
@@ -211,9 +210,9 @@ def _label_features_for_Y(self, label_ids: "np.ndarray") -> "np.ndarray":
             )
 
     # noinspection PyPep8Naming
-    def _create_session_data(
-        self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None
-    ) -> "train_utils.SessionDataType":
+    def _create_model_data(
+        self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
+    ) -> RasaModelData:
         """Combine all tf session related data into dict."""
         if data_Y is not None:
             # training time
@@ -227,144 +226,15 @@ def _create_session_data(
             label_ids = None
             Y = None
 
-        return {
-            "dialogue_features": [data_X],
-            "bot_features": [Y],
-            "action_ids": [label_ids],
-        }
-
-    def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor":
-        """Create embedding bot vector."""
-
-        b = train_utils.create_tf_fnn(
-            b_in,
-            self.hidden_layers_sizes["bot"],
-            self.droprate["bot"],
-            self.C2,
-            self._is_training,
-            layer_name_suffix="bot",
-        )
-        return train_utils.create_tf_embed(
-            b, self.embed_dim, self.C2, "bot", self.similarity_type
-        )
-
-    def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        """Create dialogue level embedding and mask."""
-
-        # mask different length sequences
-        # if there is at least one `-1` it should be masked
-        mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1)
-
-        a = train_utils.create_tf_fnn(
-            a_in,
-            self.hidden_layers_sizes["pre_dial"],
-            self.droprate["dial"],
-            self.C2,
-            self._is_training,
-            layer_name_suffix="pre_dial",
-        )
-
-        self.attention_weights = {}
-        hparams = train_utils.create_t2t_hparams(
-            self.num_transformer_layers,
-            self.transformer_size,
-            self.num_heads,
-            self.droprate["dial"],
-            self.pos_encoding,
-            self.max_seq_length,
-            self._is_training,
-        )
-
-        a = train_utils.create_t2t_transformer_encoder(
-            a, mask, self.attention_weights, hparams, self.C2, self._is_training
-        )
-
-        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
-            # pick last label if max history featurizer is used
-            a = a[:, -1:, :]
-            mask = mask[:, -1:]
-
-        dial_embed = train_utils.create_tf_embed(
-            a, self.embed_dim, self.C2, "dial", self.similarity_type
-        )
-
-        return dial_embed, mask
-
-    def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]:
-        """Bulid train graph using iterator."""
-        # iterator returns a_in, b_in, action_ids
-        self.a_in, self.b_in, _ = self._iterator.get_next()
-
-        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
-            # add time dimension if max history featurizer is used
-            self.b_in = self.b_in[:, tf.newaxis, :]
-
-        all_bot_raw = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
-        )
-
-        self.dial_embed, mask = self._create_tf_dial(self.a_in)
-
-        self.bot_embed = self._create_tf_bot_embed(self.b_in)
-        self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
-
-        return train_utils.calculate_loss_acc(
-            self.dial_embed,
-            self.bot_embed,
-            self.b_in,
-            self.all_bot_embed,
-            all_bot_raw,
-            self.num_neg,
-            mask,
-            self.loss_type,
-            self.mu_pos,
-            self.mu_neg,
-            self.use_max_sim_neg,
-            self.C_emb,
-            self.scale_loss,
+        return RasaModelData(
+            label_key="action_ids",
+            data={
+                "dialogue_features": [data_X],
+                "bot_features": [Y],
+                "action_ids": [label_ids],
+            },
         )
 
-    # prepare for prediction
-    def _create_tf_placeholders(
-        self, session_data: "train_utils.SessionDataType"
-    ) -> None:
-        """Create placeholders for prediction."""
-
-        dialogue_len = None  # use dynamic time
-        self.a_in = tf.placeholder(
-            dtype=tf.float32,
-            shape=(None, dialogue_len, session_data["dialogue_features"][0].shape[-1]),
-            name="a",
-        )
-        self.b_in = tf.placeholder(
-            dtype=tf.float32,
-            shape=(None, dialogue_len, None, session_data["bot_features"][0].shape[-1]),
-            name="b",
-        )
-
-    def _build_tf_pred_graph(
-        self, session_data: "train_utils.SessionDataType"
-    ) -> "tf.Tensor":
-        """Rebuild tf graph for prediction."""
-
-        self._create_tf_placeholders(session_data)
-
-        self.dial_embed, mask = self._create_tf_dial(self.a_in)
-
-        self.sim_all = train_utils.tf_raw_sim(
-            self.dial_embed[:, :, tf.newaxis, :],
-            self.all_bot_embed[tf.newaxis, tf.newaxis, :, :],
-            mask,
-        )
-
-        self.bot_embed = self._create_tf_bot_embed(self.b_in)
-
-        self.sim = train_utils.tf_raw_sim(
-            self.dial_embed[:, :, tf.newaxis, :], self.bot_embed, mask
-        )
-
-        return train_utils.confidence_from_sim(self.sim_all, self.similarity_type)
-
     # training methods
     def train(
         self,
@@ -377,7 +247,7 @@ def train(
         logger.debug("Started training embedding policy.")
 
         # set numpy random seed
-        np.random.seed(self.random_seed)
+        np.random.seed(self.config[RANDOM_SEED])
 
         # dealing with training data
         training_data = self.featurize_for_training(training_trackers, domain, **kwargs)
@@ -393,73 +263,32 @@ def train(
             "Check if num_neg {} is smaller "
             "than number of label_ids {}, "
             "else set num_neg to the number of label_ids - 1"
-            "".format(self.num_neg, domain.num_actions)
+            "".format(self.config[NUM_NEG], domain.num_actions)
         )
         # noinspection PyAttributeOutsideInit
-        self.num_neg = min(self.num_neg, domain.num_actions - 1)
+        self.config[NUM_NEG] = min(self.config[NUM_NEG], domain.num_actions - 1)
 
         # extract actual training data to feed to tf session
-        session_data = self._create_session_data(training_data.X, training_data.y)
-
-        if self.evaluate_on_num_examples:
-            session_data, eval_session_data = train_utils.train_val_split(
-                session_data,
-                self.evaluate_on_num_examples,
-                self.random_seed,
-                label_key="action_ids",
-            )
-        else:
-            eval_session_data = None
-
-        self.graph = tf.Graph()
-        with self.graph.as_default():
-            # set random seed in tf
-            tf.set_random_seed(self.random_seed)
-
-            # allows increasing batch size
-            batch_size_in = tf.placeholder(tf.int64)
-
-            (
-                self._iterator,
-                train_init_op,
-                eval_init_op,
-            ) = train_utils.create_iterator_init_datasets(
-                session_data,
-                eval_session_data,
-                batch_size_in,
-                self.batch_strategy,
-                label_key="action_ids",
-            )
+        model_data = self._create_model_data(training_data.X, training_data.y)
 
-            self._is_training = tf.placeholder_with_default(False, shape=())
-
-            loss, acc = self._build_tf_train_graph()
-
-            # define which optimizer to use
-            self._train_op = tf.train.AdamOptimizer().minimize(loss)
-
-            # train tensorflow graph
-            self.session = tf.Session(config=self._tf_config)
-            train_utils.train_tf_dataset(
-                train_init_op,
-                eval_init_op,
-                batch_size_in,
-                TrainingMetrics(loss={"loss": loss}, score={"acc": acc}),
-                self._train_op,
-                self.session,
-                self._is_training,
-                self.epochs,
-                self.batch_size,
-                self.evaluate_on_num_examples,
-                self.evaluate_every_num_epochs,
-            )
+        # keep one example for persisting and loading
+        self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
-            # rebuild the graph for prediction
-            self.pred_confidence = self._build_tf_pred_graph(session_data)
+        self.model = TED(
+            self.config,
+            isinstance(self.featurizer, MaxHistoryTrackerFeaturizer),
+            self._encoded_all_label_ids,
+        )
 
-            self.attention_weights = train_utils.extract_attention(
-                self.attention_weights
-            )
+        self.model.fit(
+            model_data,
+            self.config[EPOCHS],
+            self.config[BATCH_SIZES],
+            self.config[EVAL_NUM_EXAMPLES],
+            self.config[EVAL_NUM_EPOCHS],
+            batch_strategy=self.config[BATCH_STRATEGY],
+            random_seed=self.config[RANDOM_SEED],
+        )
 
     def continue_training(
         self,
@@ -472,41 +301,21 @@ def continue_training(
         batch_size = kwargs.get("batch_size", 5)
         epochs = kwargs.get("epochs", 50)
 
-        with self.graph.as_default():
-            for _ in range(epochs):
-                training_data = self._training_data_for_continue_training(
-                    batch_size, training_trackers, domain
-                )
-
-                session_data = self._create_session_data(
-                    training_data.X, training_data.y
-                )
-                train_dataset = train_utils.create_tf_dataset(
-                    session_data, batch_size, label_key="action_ids"
-                )
-                train_init_op = self._iterator.make_initializer(train_dataset)
-                self.session.run(train_init_op)
-
-                # fit to one extra example using updated trackers
-                while True:
-                    try:
-                        self.session.run(
-                            self._train_op, feed_dict={self._is_training: True}
-                        )
-
-                    except tf.errors.OutOfRangeError:
-                        break
-
-    def tf_feed_dict_for_prediction(
-        self, tracker: "DialogueStateTracker", domain: "Domain"
-    ) -> Dict["tf.Tensor", "np.ndarray"]:
-        """Create feed dictionary for tf session."""
+        training_data = self._training_data_for_continue_training(
+            batch_size, training_trackers, domain
+        )
 
-        # noinspection PyPep8Naming
-        data_X = self.featurizer.create_X([tracker], domain)
-        session_data = self._create_session_data(data_X)
+        model_data = self._create_model_data(training_data.X, training_data.y)
 
-        return {self.a_in: session_data["dialogue_features"][0]}
+        self.model.fit(
+            model_data,
+            epochs,
+            [batch_size],
+            self.config[EVAL_NUM_EXAMPLES],
+            self.config[EVAL_NUM_EPOCHS],
+            batch_strategy=self.config[BATCH_STRATEGY],
+            random_seed=self.config[RANDOM_SEED],
+        )
 
     def predict_action_probabilities(
         self, tracker: "DialogueStateTracker", domain: "Domain"
@@ -515,66 +324,51 @@ def predict_action_probabilities(
 
         Return the list of probabilities for the next actions.
         """
-
-        if self.session is None:
-            logger.error(
-                "There is no trained tf.session: "
-                "component is either not trained or "
-                "didn't receive enough training data"
-            )
+        if self.model is None or self.predict_func is None:
             return [0.0] * domain.num_actions
 
-        tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain)
+        # create model data from message and convert it into a batch of 1
+        data_X = self.featurizer.create_X([tracker], domain)
+        model_data = self._create_model_data(data_X)
+        predict_dataset = model_data.as_tf_dataset(1)
+        batch_in = next(iter(predict_dataset))
 
-        confidence = self.session.run(self.pred_confidence, feed_dict=tf_feed_dict)
+        confidence = self.predict_func(batch_in)
 
         return confidence[0, -1, :].tolist()
 
-    def persist(self, path: Text) -> None:
+    def persist(self, path: Text):
         """Persists the policy to a storage."""
 
-        if self.session is None:
-            warnings.warn(
-                "Method `persist(...)` was called "
-                "without a trained model present. "
-                "Nothing to persist then!"
-            )
+        if self.model is None:
             return
 
-        self.featurizer.persist(path)
+        file_name = "embedding_policy"
+        tf_model_file = os.path.join(path, f"{file_name}.tf_model")
 
-        meta = {"priority": self.priority}
+        rasa.utils.io.create_directory_for_file(tf_model_file)
 
-        meta_file = os.path.join(path, "embedding_policy.json")
-        rasa.utils.io.dump_obj_as_json_to_file(meta_file, meta)
+        self.featurizer.persist(path)
 
-        file_name = "tensorflow_embedding.ckpt"
-        checkpoint = os.path.join(path, file_name)
-        rasa.utils.io.create_directory_for_file(checkpoint)
+        self.model.save_weights(tf_model_file, save_format="tf")
 
-        with self.graph.as_default():
-            train_utils.persist_tensor("user_placeholder", self.a_in, self.graph)
-            train_utils.persist_tensor("bot_placeholder", self.b_in, self.graph)
+        with open(os.path.join(path, file_name + ".tf_config.pkl"), "wb") as f:
+            pickle.dump(self._tf_config, f)
 
-            train_utils.persist_tensor("similarity_all", self.sim_all, self.graph)
-            train_utils.persist_tensor(
-                "pred_confidence", self.pred_confidence, self.graph
-            )
-            train_utils.persist_tensor("similarity", self.sim, self.graph)
+        self.config["priority"] = self.priority
 
-            train_utils.persist_tensor("dial_embed", self.dial_embed, self.graph)
-            train_utils.persist_tensor("bot_embed", self.bot_embed, self.graph)
-            train_utils.persist_tensor("all_bot_embed", self.all_bot_embed, self.graph)
+        with open(os.path.join(path, file_name + ".meta.pkl"), "wb") as f:
+            pickle.dump(self.config, f)
 
-            train_utils.persist_tensor(
-                "attention_weights", self.attention_weights, self.graph
-            )
+        with open(os.path.join(path, file_name + ".data_example.pkl"), "wb") as f:
+            pickle.dump(self.data_example, f)
 
-            saver = tf.train.Saver()
-            saver.save(self.session, checkpoint)
+        with open(
+            os.path.join(path, file_name + ".encoded_all_label_ids.pkl"), "wb"
+        ) as f:
+            pickle.dump(self._encoded_all_label_ids, f)
 
-        with open(os.path.join(path, file_name + ".tf_config.pkl"), "wb") as f:
-            pickle.dump(self._tf_config, f)
+        return {"file": file_name}
 
     @classmethod
     def load(cls, path: Text) -> "EmbeddingPolicy":
@@ -589,62 +383,91 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
                 "doesn't exist".format(os.path.abspath(path))
             )
 
-        featurizer = TrackerFeaturizer.load(path)
+        file_name = "embedding_policy"
+        tf_model_file = os.path.join(path, f"{file_name}.tf_model")
 
-        file_name = "tensorflow_embedding.ckpt"
-        checkpoint = os.path.join(path, file_name)
+        featurizer = TrackerFeaturizer.load(path)
 
-        if not os.path.exists(checkpoint + ".meta"):
+        if not os.path.exists(tf_model_file + ".meta"):
             return cls(featurizer=featurizer)
 
-        meta_file = os.path.join(path, "embedding_policy.json")
-        meta = json.loads(rasa.utils.io.read_file(meta_file))
-
-        with open(os.path.join(path, file_name + ".tf_config.pkl"), "rb") as f:
-            _tf_config = pickle.load(f)
+        with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
+            model_data_example = RasaModelData(
+                label_key="actions_ids", data=pickle.load(f)
+            )
 
-        graph = tf.Graph()
-        with graph.as_default():
-            session = tf.Session(config=_tf_config)
-            saver = tf.train.import_meta_graph(checkpoint + ".meta")
+        with open(
+            os.path.join(path, file_name + ".encoded_all_label_ids.pkl"), "rb"
+        ) as f:
+            encoded_all_label_ids = pickle.load(f)
 
-            saver.restore(session, checkpoint)
+        with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
+            meta = pickle.load(f)
 
-            a_in = train_utils.load_tensor("user_placeholder")
-            b_in = train_utils.load_tensor("bot_placeholder")
+        if meta[SIMILARITY_TYPE] == "auto":
+            if meta[LOSS_TYPE] == "softmax":
+                meta[SIMILARITY_TYPE] = "inner"
+            elif meta[LOSS_TYPE] == "margin":
+                meta[SIMILARITY_TYPE] = "cosine"
 
-            sim_all = train_utils.load_tensor("similarity_all")
-            pred_confidence = train_utils.load_tensor("pred_confidence")
-            sim = train_utils.load_tensor("similarity")
+        model = TED(
+            meta,
+            isinstance(featurizer, MaxHistoryTrackerFeaturizer),
+            encoded_all_label_ids,
+        )
 
-            dial_embed = train_utils.load_tensor("dial_embed")
-            bot_embed = train_utils.load_tensor("bot_embed")
-            all_bot_embed = train_utils.load_tensor("all_bot_embed")
+        logger.debug("Loading the model ...")
+        model.fit(
+            model_data_example,
+            1,
+            1,
+            0,
+            0,
+            batch_strategy=meta[BATCH_STRATEGY],
+            silent=True,  # don't confuse users with training output
+            eager=True,  # no need to build tf graph, eager is faster here
+        )
+        model.load_weights(tf_model_file)
 
-            attention_weights = train_utils.load_tensor("attention_weights")
+        # build the graph for prediction
+        model.set_training_phase(False)
+        model_data = RasaModelData(
+            label_key="actions_ids",
+            data={k: vs for k, vs in model_data_example.items() if "text" in k},
+        )
+        model.data_signature = model_data.get_signature()
+        model.build_for_predict(model_data)
+        predict_dataset = model_data.as_tf_dataset(
+            1, batch_strategy="sequence", shuffle=False
+        )
+        predict_func = tf.function(
+            func=model.predict, input_signature=[predict_dataset.element_spec]
+        )
+        batch_in = next(iter(predict_dataset))
+        predict_func(batch_in)
+        logger.debug("Finished loading the model.")
 
         return cls(
-            featurizer=featurizer,
+            component_config=meta,
             priority=meta["priority"],
-            graph=graph,
-            session=session,
-            user_placeholder=a_in,
-            bot_placeholder=b_in,
-            similarity_all=sim_all,
-            pred_confidence=pred_confidence,
-            similarity=sim,
-            dial_embed=dial_embed,
-            bot_embed=bot_embed,
-            all_bot_embed=all_bot_embed,
-            attention_weights=attention_weights,
+            model=model,
+            predict_func=predict_func,
         )
 
 
 class TED(tf_models.RasaModel):
-    def __init__(self, config: Dict[Text, Any]):
+    def __init__(
+        self,
+        config: Dict[Text, Any],
+        max_history_tracker_featurizer_used: bool,
+        encoded_all_label_ids: np.ndarray,
+    ):
         super().__init__()
 
         self.config = config
+        self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
+
+        self._encoded_all_label_ids = encoded_all_label_ids
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
@@ -707,12 +530,12 @@ def _create_tf_dial(self, a_in: tf.Tensor):
 
         # mask different length sequences
         # if there is at least one `-1` it should be masked
-        mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1)
+        mask = tf.sign(tf.reduce_max(a_in, -1) + 1)
 
         a = self._ffnn_pre_dial(a_in, self.training)
         a = self._transformer(a, mask, self.training)
 
-        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
+        if self.max_history_tracker_featurizer_used:
             # pick last label if max history featurizer is used
             a = a[:, -1:, :]
             mask = mask[:, -1:]
@@ -730,7 +553,7 @@ def _train_losses_scores(
     ) -> None:
         a_in, b_in, _ = batch_in
 
-        if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
+        if self.max_history_tracker_featurizer_used:
             # add time dimension if max history featurizer is used
             b_in = b_in[:, tf.newaxis, :]
 
@@ -738,9 +561,9 @@ def _train_losses_scores(
             self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
         )
 
-        dial_embed, mask = self._create_tf_dial(self.a_in)
+        dial_embed, mask = self._create_tf_dial(a_in)
 
-        bot_embed = self._create_tf_bot_embed(self.b_in)
+        bot_embed = self._create_tf_bot_embed(b_in)
         self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
 
         loss, acc = self._loss_label(
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 0b879554d9ef..0336e7d9ef9b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -9,6 +9,7 @@
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, Callable
 
+import rasa.utils.io
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -710,14 +711,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        try:
-            os.makedirs(os.path.dirname(tf_model_file))
-        except OSError as e:
-            # be happy if someone already created the path
-            import errno
-
-            if e.errno != errno.EEXIST:
-                raise
+        rasa.utils.io.create_directory_for_file(tf_model_file)
 
         self.model.save_weights(tf_model_file, save_format="tf")
 

From fc2e39359365bb559672ea044355006f294c4ddd Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 23 Jan 2020 13:56:14 +0100
Subject: [PATCH 174/633] updated version of tf text in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 09e28173c17f..9f7e27330c41 100644
--- a/setup.py
+++ b/setup.py
@@ -86,7 +86,7 @@
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
-    "convert": ["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"],
+    "convert": ["tensorflow_text~=2.1.0rc0", "tensorflow_hub~=0.7.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],

From a44d660e998ef4563a12b5092dde3bce1cf8e812 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 23 Jan 2020 14:09:02 +0100
Subject: [PATCH 175/633] fix loss

---
 .../embedding_intent_classifier.py            | 21 +++--
 rasa/utils/tensorflow/tf_models.py            | 83 ++++++++-----------
 2 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 50e9365321c6..c772fc114882 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -875,12 +875,16 @@ def __init__(
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
-        self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
-        self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
+        # self.metrics preserve order
+        # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
+        # output accuracies second
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
+
         self._update_metrics_to_log()
 
         # persist
@@ -1130,9 +1134,9 @@ def _entity_loss(
 
         return loss, f1
 
-    def _train_losses_scores(
+    def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> None:
+    ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1142,10 +1146,13 @@ def _train_losses_scores(
             tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
         )
 
+        losses = []
+
         if self.config[MASKED_LM]:
             loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text)
             self.mask_loss.update_state(loss)
             self.mask_acc.update_state(acc)
+            losses.append(loss)
 
         if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
@@ -1161,6 +1168,7 @@ def _train_losses_scores(
             loss, acc = self._intent_loss(cls, label)
             self.intent_loss.update_state(loss)
             self.intent_acc.update_state(acc)
+            losses.append(loss)
 
         if self.config[ENTITY_RECOGNITION]:
             tags = tf_batch_data["tag_ids"][0]
@@ -1170,6 +1178,9 @@ def _train_losses_scores(
             )
             self.entity_loss.update_state(loss)
             self.entity_f1.update_state(f1)
+            losses.append(loss)
+
+        return tf.math.add_n(losses)
 
     def build_for_predict(self, model_data: RasaModelData) -> None:
         self.batch_tuple_sizes = model_data.batch_tuple_sizes()
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index d66fcec7fb65..daca15408dc9 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -20,6 +20,16 @@ def __init__(self, *args, **kwargs):
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
         self.metrics_to_log = ["t_loss"]
 
+    def batch_loss(
+            self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> tf.Tensor:
+        raise NotImplementedError
+
+    def predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+    ) -> Dict[Text, tf.Tensor]:
+        raise NotImplementedError
+
     def fit(
         self,
         model_data: RasaModelData,
@@ -70,7 +80,7 @@ def fit(
                 ep, batch_size, epochs
             )
 
-            self._reset_metrics()
+            self.reset_metrics()
 
             # Train on batches
             self.set_training_phase(True)
@@ -164,28 +174,13 @@ def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]
             if metric.name in self.metrics_to_log
         }
 
-    def _reset_metrics(self) -> None:
-        # Reset the metrics
-        for metric in self.metrics:
-            metric.reset_states()
-
-    def _get_losses_from_metrics(self) -> List[tf.Tensor]:
-        return list(
-            [
-                m.result()
-                for m in self.metrics
-                if "loss" in m.name.lower() and m.name in self.metrics_to_log
-            ]
-        )
-
     def train_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
         with tf.GradientTape() as tape:
-            self._train_losses_scores(batch_in)
+            prediction_loss = self.batch_loss(batch_in)
             regularization_loss = tf.math.add_n(self.losses)
-            pred_loss = tf.math.add_n(self._get_losses_from_metrics())
-            total_loss = pred_loss + regularization_loss
+            total_loss = prediction_loss + regularization_loss
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
@@ -195,39 +190,12 @@ def train_on_batch(
     def evaluate_on_batch(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> None:
-        self._train_losses_scores(batch_in)
+        prediction_loss = self.batch_loss(batch_in)
         regularization_loss = tf.math.add_n(self.losses)
-        pred_loss = tf.math.add_n(self._get_losses_from_metrics())
-        total_loss = pred_loss + regularization_loss
+        total_loss = prediction_loss + regularization_loss
 
         self.total_loss.update_state(total_loss)
 
-    def compile(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def evaluate(self, **kwargs) -> None:
-        pass
-
-    def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
-    ) -> Dict[Text, tf.Tensor]:
-        pass
-
-    def test_on_batch(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def predict_on_batch(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def fit_generator(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def evaluate_generator(self, **kwargs) -> None:
-        raise NotImplementedError
-
-    def predict_generator(self, **kwargs) -> None:
-        raise NotImplementedError
-
     @staticmethod
     def _should_evaluate(
         evaluate_every_num_epochs: int, epochs: int, current_epoch: int
@@ -291,3 +259,24 @@ def linearly_increasing_batch_size(
             )
         else:
             return int(batch_size[0])
+
+    def compile(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def evaluate(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def test_on_batch(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def predict_on_batch(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def fit_generator(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def evaluate_generator(self, **kwargs) -> None:
+        raise NotImplemented
+
+    def predict_generator(self, **kwargs) -> None:
+        raise NotImplemented

From 10997ef44c55705cb22f82eae16dd530a6c50ea4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 23 Jan 2020 14:12:24 +0100
Subject: [PATCH 176/633] black

---
 rasa/utils/tensorflow/tf_model_data.py | 13 ++++++++-----
 rasa/utils/tensorflow/tf_models.py     |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index a08f4daf90d8..9ad0ff0eb413 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -9,6 +9,7 @@
 
 Data = Optional[Dict[Text, List[np.ndarray]]]
 
+
 class FeatureSignature(NamedTuple):
     is_sparse: bool
     shape: List[int]
@@ -16,9 +17,7 @@ class FeatureSignature(NamedTuple):
 
 class RasaModelData:
     def __init__(
-        self,
-        label_key: Optional[Text] = None,
-        data: Data = None,
+        self, label_key: Optional[Text] = None, data: Data = None,
     ):
         self.data = data or {}
         self.label_key = label_key or ""
@@ -346,7 +345,9 @@ def _gen_batch(
         if batch_strategy == "balanced":
             data = self.balanced_data(data, batch_size, shuffle)
 
-        num_batches = self.num_examples // batch_size + int(self.num_examples % batch_size > 0)
+        num_batches = self.num_examples // batch_size + int(
+            self.num_examples % batch_size > 0
+        )
 
         for batch_num in range(num_batches):
             start = batch_num * batch_size
@@ -389,7 +390,9 @@ def _split_by_label_ids(
         label_data = []
         for label_id in unique_label_ids:
             ids = label_ids == label_id
-            label_data.append(RasaModelData(self.label_key, self._data_for_ids(data, ids)))
+            label_data.append(
+                RasaModelData(self.label_key, self._data_for_ids(data, ids))
+            )
         return label_data
 
     def _check_label_key(self, label_key: Text):
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index daca15408dc9..ed9ad4f2aa01 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -21,7 +21,7 @@ def __init__(self, *args, **kwargs):
         self.metrics_to_log = ["t_loss"]
 
     def batch_loss(
-            self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> tf.Tensor:
         raise NotImplementedError
 

From 502bdc81268daf902e03b622ad8dfe9cbb7ea3e0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 14:20:33 +0100
Subject: [PATCH 177/633] no errors during training

---
 rasa/core/policies/embedding_policy.py | 4 +++-
 rasa/utils/tensorflow/tf_models.py     | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 03685c6db6aa..f93aee0622c0 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -469,6 +469,8 @@ def __init__(
 
         self._encoded_all_label_ids = encoded_all_label_ids
 
+        self._optimizer = tf.keras.optimizers.Adam()
+
         # tf tensors
         self.training = tf.ones((), tf.bool)
 
@@ -533,7 +535,7 @@ def _create_tf_dial(self, a_in: tf.Tensor):
         mask = tf.sign(tf.reduce_max(a_in, -1) + 1)
 
         a = self._ffnn_pre_dial(a_in, self.training)
-        a = self._transformer(a, mask, self.training)
+        a = self._transformer(a, tf.expand_dims(mask, axis=-1), self.training)
 
         if self.max_history_tracker_featurizer_used:
             # pick last label if max history featurizer is used
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index d66fcec7fb65..8babda5ec9db 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -29,7 +29,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = False,
+        eager: bool = True,
         random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:

From 9b271b1ea6ed47b018b4e34f2ab95a8a98a6e700 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 14:28:21 +0100
Subject: [PATCH 178/633] report correct metrics

---
 rasa/core/policies/embedding_policy.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index f93aee0622c0..44310eaa0f72 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -90,7 +90,7 @@ class EmbeddingPolicy(Policy):
         # how to create batches
         BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
-        EPOCHS: 1,
+        EPOCHS: 10,
         # set random seed to any int to get reproducible results
         RANDOM_SEED: None,
         # embedding parameters
@@ -479,6 +479,7 @@ def __init__(
 
         self.metric_loss = tf.keras.metrics.Mean(name="loss")
         self.metric_acc = tf.keras.metrics.Mean(name="acc")
+        self.metrics_to_log = ["loss", "acc"]
 
         self._loss_label = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
@@ -550,9 +551,9 @@ def _create_tf_bot_embed(self, b_in: tf.Tensor):
         b = self._ffnn_bot(b_in, self.training)
         return self._embed_bot(b)
 
-    def _train_losses_scores(
+    def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> None:
+    ) -> tf.Tensor:
         a_in, b_in, _ = batch_in
 
         if self.max_history_tracker_featurizer_used:
@@ -575,6 +576,8 @@ def _train_losses_scores(
         self.metric_loss.update_state(loss)
         self.metric_acc.update_state(acc)
 
+        return loss
+
     def build_for_predict(self) -> None:
         all_bot_raw = tf.constant(
             self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"

From 0b30ba3eba97d0bf207a64013e7d04a8f033bce4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 23 Jan 2020 15:09:34 +0100
Subject: [PATCH 179/633] refactor RasaModel

---
 .../embedding_intent_classifier.py            |  14 +-
 rasa/utils/tensorflow/tf_models.py            | 201 ++++++++++--------
 2 files changed, 118 insertions(+), 97 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index c772fc114882..e3540b01cec1 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -875,6 +875,14 @@ def __init__(
 
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
+        self._create_metrics()
+        self._update_metrics_to_log()
+
+        # persist
+        self.all_labels_embed = None
+        self.batch_tuple_sizes = None
+
+    def _create_metrics(self):
         # self.metrics preserve order
         # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
@@ -885,12 +893,6 @@ def __init__(
         self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
 
-        self._update_metrics_to_log()
-
-        # persist
-        self.all_labels_embed = None
-        self.batch_tuple_sizes = None
-
     def _update_metrics_to_log(self) -> None:
         if self.config[MASKED_LM]:
             self.metrics_to_log += ["m_loss", "m_acc"]
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index ed9ad4f2aa01..bb108405b0a1 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -43,60 +43,58 @@ def fit(
         random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:
-        """Train tf graph"""
+        """Fit model data"""
+
+        disable = silent or is_logging_disabled()
 
         evaluation_model_data = None
         if evaluate_on_num_examples > 0:
-            logger.info(
-                f"Validation accuracy is calculated every {evaluate_every_num_epochs} "
-                f"epochs."
-            )
+            if not disable:
+                logger.info(
+                    f"Validation accuracy is calculated every "
+                    f"{evaluate_every_num_epochs} epochs."
+                )
 
             model_data, evaluation_model_data = model_data.split(
                 evaluate_on_num_examples, random_seed
             )
 
-        disable = silent or is_logging_disabled()
-
-        tf_batch_size = tf.ones((), tf.int32)
         (
             tf_train_dataset_function,
             tf_train_on_batch_function,
-        ) = self._get_tf_train_functions(
-            eager, model_data, batch_strategy, tf_batch_size
-        )
+        ) = self._get_tf_train_functions(eager, model_data, batch_strategy)
 
         (
             tf_evaluation_dataset_function,
             tf_evaluation_on_batch_function,
         ) = self._get_tf_evaluation_functions(
-            eager, evaluate_on_num_examples, evaluation_model_data, tf_batch_size
+            eager, evaluate_on_num_examples, evaluation_model_data
         )
 
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         for ep in pbar:
-            ep_batch_size = tf_batch_size * self.linearly_increasing_batch_size(
-                ep, batch_size, epochs
+            ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
+            if not eager:
+                ep_batch_size *= tf.ones((), tf.int32)
+
+            self._batch_loop(
+                tf_train_dataset_function,
+                tf_train_on_batch_function,
+                ep_batch_size,
+                True,
             )
 
-            self.reset_metrics()
-
-            # Train on batches
-            self.set_training_phase(True)
-            for batch_in in tf_train_dataset_function(ep_batch_size):
-                tf_train_on_batch_function(batch_in)
-
             postfix_dict = self._get_metric_results()
 
             if evaluate_on_num_examples > 0:
                 if self._should_evaluate(evaluate_every_num_epochs, epochs, ep):
-                    self._reset_metrics()
-
-                    # Eval on batches
-                    self.set_training_phase(False)
-                    for batch_in in tf_evaluation_dataset_function(ep_batch_size):
-                        tf_evaluation_on_batch_function(batch_in)
+                    self._batch_loop(
+                        tf_evaluation_dataset_function,
+                        tf_evaluation_on_batch_function,
+                        ep_batch_size,
+                        False,
+                    )
 
                 # Get the metric results
                 postfix_dict.update(self._get_metric_results(prefix="val_"))
@@ -106,96 +104,117 @@ def fit(
         if not disable:
             logger.info("Finished training.")
 
-    def _get_tf_train_functions(
+    def train_on_batch(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+    ) -> None:
+        """Train on batch"""
+
+        with tf.GradientTape() as tape:
+            total_loss = self._total_batch_loss(batch_in)
+
+        gradients = tape.gradient(total_loss, self.trainable_variables)
+        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+    def _total_batch_loss(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> tf.Tensor:
+        """Calculate total loss"""
+
+        prediction_loss = self.batch_loss(batch_in)
+        regularization_loss = tf.math.add_n(self.losses)
+        total_loss = prediction_loss + regularization_loss
+        self.total_loss.update_state(total_loss)
+
+        return total_loss
+
+    def _batch_loop(
         self,
+        dataset_function: Callable,
+        method_function: Callable,
+        batch_size: Union[tf.Tensor, int],
+        training: bool,
+    ) -> None:
+        """Run on batches"""
+
+        self.reset_metrics()
+        self.set_training_phase(training)
+        for batch_in in dataset_function(batch_size):
+            method_function(batch_in)
+
+    @staticmethod
+    def _get_tf_functions(
+        dataset_function: Callable,
+        method_function: Callable,
         eager: bool,
-        model_data: RasaModelData,
-        batch_strategy: Text,
-        tf_batch_size: tf.Tensor,
+        method: Text,
     ) -> Tuple[Callable, Callable]:
-        def train_dataset_function(_batch_size):
-            return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
+        """Convert functions to tensorflow functions"""
 
         if eager:
-            tf_train_dataset_function = train_dataset_function
-            tf_train_on_batch_function = self.train_on_batch
-        else:
-            logger.debug("Building tensorflow train graph...")
-            # allows increasing batch size
-            tf_train_dataset_function = tf.function(func=train_dataset_function)
-            init_dataset = tf_train_dataset_function(tf_batch_size)
-            tf_train_on_batch_function = tf.function(
-                self.train_on_batch, input_signature=[init_dataset.element_spec]
-            )
-            tf_train_on_batch_function(next(iter(init_dataset)))
-            logger.debug("Finished building tensorflow train graph")
+            return dataset_function, method_function
 
-        return tf_train_dataset_function, tf_train_on_batch_function
+        logger.debug(f"Building tensorflow {method} graph...")
+        # allows increasing batch size
+        tf_dataset_function = tf.function(func=dataset_function)
+
+        init_dataset = tf_dataset_function(tf.ones((), tf.int32))
+
+        tf_method_function = tf.function(
+            method_function, input_signature=[init_dataset.element_spec]
+        )
+        tf_method_function(next(iter(init_dataset)))
+
+        logger.debug(f"Finished building tensorflow {method} graph")
+
+        return tf_dataset_function, tf_method_function
+
+    def _get_tf_train_functions(
+        self, eager: bool, model_data: RasaModelData, batch_strategy: Text,
+    ) -> Tuple[Callable, Callable]:
+        """Create train tensorflow functions"""
+
+        def train_dataset_function(_batch_size):
+            return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
+
+        return self._get_tf_functions(
+            train_dataset_function, self.train_on_batch, eager, "train"
+        )
 
     def _get_tf_evaluation_functions(
         self,
         eager: bool,
         evaluate_on_num_examples: int,
         evaluation_model_data: RasaModelData,
-        tf_batch_size: tf.Tensor,
-    ) -> Tuple[Callable, Callable]:
-        def evaluation_dataset_function(_batch_size):
-            return evaluation_model_data.as_tf_dataset(
-                _batch_size, "sequence", shuffle=False
-            )
+    ) -> Tuple[Optional[Callable], Optional[Callable]]:
+        """Create evaluation tensorflow functions"""
 
         if evaluate_on_num_examples > 0:
-            if eager:
-                tf_evaluation_dataset_function = evaluation_dataset_function
-                tf_evaluation_on_batch_function = self.evaluate_on_batch
-            else:
-                tf_evaluation_dataset_function = tf.function(
-                    func=evaluation_dataset_function
-                )
-                tf_evaluation_on_batch_function = tf.function(
-                    self.evaluate_on_batch,
-                    input_signature=[
-                        tf_evaluation_dataset_function(tf_batch_size).element_spec
-                    ],
+
+            def evaluation_dataset_function(_batch_size):
+                return evaluation_model_data.as_tf_dataset(
+                    _batch_size, "sequence", shuffle=False
                 )
-        else:
-            tf_evaluation_dataset_function = None
-            tf_evaluation_on_batch_function = None
 
-        return tf_evaluation_dataset_function, tf_evaluation_on_batch_function
+            return self._get_tf_functions(
+                evaluation_dataset_function,
+                self._total_batch_loss,
+                eager,
+                "evaluation",
+            )
+
+        return None, None
 
     def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]:
+        """Get the metrics results"""
+
         prefix = prefix or ""
 
-        # Get the metric results
         return {
             f"{prefix}{metric.name}": f"{metric.result().numpy():.3f}"
             for metric in self.metrics
             if metric.name in self.metrics_to_log
         }
 
-    def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
-    ) -> None:
-        with tf.GradientTape() as tape:
-            prediction_loss = self.batch_loss(batch_in)
-            regularization_loss = tf.math.add_n(self.losses)
-            total_loss = prediction_loss + regularization_loss
-
-        gradients = tape.gradient(total_loss, self.trainable_variables)
-        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
-
-        self.total_loss.update_state(total_loss)
-
-    def evaluate_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
-    ) -> None:
-        prediction_loss = self.batch_loss(batch_in)
-        regularization_loss = tf.math.add_n(self.losses)
-        total_loss = prediction_loss + regularization_loss
-
-        self.total_loss.update_state(total_loss)
-
     @staticmethod
     def _should_evaluate(
         evaluate_every_num_epochs: int, epochs: int, current_epoch: int

From e698d7a46ffa0a2528a8fa3f400daa992a49deb0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 15:37:18 +0100
Subject: [PATCH 180/633] clean up

---
 rasa/core/policies/embedding_policy.py | 111 +++++++++++++------------
 1 file changed, 59 insertions(+), 52 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 44310eaa0f72..ae405f180e9d 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -4,6 +4,8 @@
 import pickle
 
 import numpy as np
+import tensorflow as tf
+
 from typing import Any, List, Optional, Text, Dict, Tuple, Union, Callable
 
 import rasa.utils.io
@@ -18,13 +20,10 @@
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils import train_utils
-
-import tensorflow as tf
-
-# avoid warning println on contrib import - remove for tf 2
 from rasa.utils.tensorflow import tf_models, tf_layers
 from rasa.utils.tensorflow.tf_model_data import RasaModelData
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -131,7 +130,7 @@ class EmbeddingPolicy(Policy):
     # end default properties (DOC MARKER - don't remove)
 
     @staticmethod
-    def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturizer":
+    def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer:
         if max_history is None:
             return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer())
         else:
@@ -141,7 +140,7 @@ def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturize
 
     def __init__(
         self,
-        featurizer: Optional["TrackerFeaturizer"] = None,
+        featurizer: Optional[TrackerFeaturizer] = None,
         priority: int = DEFAULT_POLICY_PRIORITY,
         max_history: Optional[int] = None,
         model: Optional[tf_models.RasaModel] = None,
@@ -152,6 +151,7 @@ def __init__(
 
         if not featurizer:
             featurizer = self._standard_featurizer(max_history)
+
         super().__init__(featurizer, priority)
 
         self._load_params(**kwargs)
@@ -166,7 +166,6 @@ def __init__(
 
         self._tf_config = train_utils.load_tf_config(self.config)
 
-    # init helpers
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
@@ -214,6 +213,7 @@ def _create_model_data(
         self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
     ) -> RasaModelData:
         """Combine all tf session related data into dict."""
+
         if data_Y is not None:
             # training time
             label_ids = self._label_ids_for_Y(data_Y)
@@ -238,8 +238,8 @@ def _create_model_data(
     # training methods
     def train(
         self,
-        training_trackers: List["DialogueStateTracker"],
-        domain: "Domain",
+        training_trackers: List[DialogueStateTracker],
+        domain: Domain,
         **kwargs: Any,
     ) -> None:
         """Train the policy on given training trackers."""
@@ -265,10 +265,9 @@ def train(
             "else set num_neg to the number of label_ids - 1"
             "".format(self.config[NUM_NEG], domain.num_actions)
         )
-        # noinspection PyAttributeOutsideInit
         self.config[NUM_NEG] = min(self.config[NUM_NEG], domain.num_actions - 1)
 
-        # extract actual training data to feed to tf session
+        # extract actual training data to feed to model
         model_data = self._create_model_data(training_data.X, training_data.y)
 
         # keep one example for persisting and loading
@@ -292,8 +291,8 @@ def train(
 
     def continue_training(
         self,
-        training_trackers: List["DialogueStateTracker"],
-        domain: "Domain",
+        training_trackers: List[DialogueStateTracker],
+        domain: Domain,
         **kwargs: Any,
     ) -> None:
         """Continue training an already trained policy."""
@@ -318,7 +317,7 @@ def continue_training(
         )
 
     def predict_action_probabilities(
-        self, tracker: "DialogueStateTracker", domain: "Domain"
+        self, tracker: DialogueStateTracker, domain: Domain
     ) -> List[float]:
         """Predict the next action the bot should take.
 
@@ -393,7 +392,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
 
         with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
             model_data_example = RasaModelData(
-                label_key="actions_ids", data=pickle.load(f)
+                label_key="action_ids", data=pickle.load(f)
             )
 
         with open(
@@ -432,11 +431,11 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         # build the graph for prediction
         model.set_training_phase(False)
         model_data = RasaModelData(
-            label_key="actions_ids",
+            label_key="action_ids",
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
         model.data_signature = model_data.get_signature()
-        model.build_for_predict(model_data)
+        model.build_for_predict()
         predict_dataset = model_data.as_tf_dataset(
             1, batch_strategy="sequence", shuffle=False
         )
@@ -466,22 +465,28 @@ def __init__(
 
         self.config = config
         self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
-
         self._encoded_all_label_ids = encoded_all_label_ids
 
+        # optimizer
         self._optimizer = tf.keras.optimizers.Adam()
 
         # tf tensors
         self.training = tf.ones((), tf.bool)
 
         # persist
-        self.all_bot_embed = None
+        self.all_label_embed = None
 
+        # metrics
         self.metric_loss = tf.keras.metrics.Mean(name="loss")
         self.metric_acc = tf.keras.metrics.Mean(name="acc")
         self.metrics_to_log = ["loss", "acc"]
 
-        self._loss_label = tf_layers.DotProductLoss(
+        # set up tf layers
+        self._tf_layers = {}
+        self._prepare_layers()
+
+    def _prepare_layers(self) -> None:
+        self._tf_layers["loss.label"] = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -490,19 +495,19 @@ def __init__(
             self.config[C_EMB],
             self.config[SCALE_LOSS],
         )
-        self._ffnn_pre_dial = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.dial"] = tf_layers.ReluFfn(
             self.config[HIDDEN_LAYERS_SIZES_PRE_DIAL],
             self.config[DROPRATE_DIAL],
             self.config[C2],
             layer_name_suffix="pre_dial",
         )
-        self._ffnn_bot = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.bot"] = tf_layers.ReluFfn(
             self.config[HIDDEN_LAYERS_SIZES_BOT],
             self.config[DROPRATE_BOT],
             self.config[C2],
             layer_name_suffix="bot",
         )
-        self._transformer = tf_layers.TransformerEncoder(
+        self._tf_layers["transformer"] = tf_layers.TransformerEncoder(
             self.config[NUM_TRANSFORMER_LAYERS],
             self.config[TRANSFORMER_SIZE],
             self.config[NUM_HEADS],
@@ -512,13 +517,13 @@ def __init__(
             self.config[DROPRATE_DIAL],
             name="dial_encoder",
         )
-        self._embed_dial = tf_layers.Embed(
+        self._tf_layers["embed.dial"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "dial",
             self.config[SIMILARITY_TYPE],
         )
-        self._embed_bot = tf_layers.Embed(
+        self._tf_layers["embed.bot"] = tf_layers.Embed(
             self.config[EMBED_DIM], self.config[C2], "bot", self.config[SIMILARITY_TYPE]
         )
 
@@ -528,49 +533,51 @@ def set_training_phase(self, training: bool) -> None:
         else:
             self.training = tf.zeros((), tf.bool)
 
-    def _create_tf_dial(self, a_in: tf.Tensor):
+    def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
 
         # mask different length sequences
         # if there is at least one `-1` it should be masked
-        mask = tf.sign(tf.reduce_max(a_in, -1) + 1)
+        mask = tf.sign(tf.reduce_max(dialogue_in, -1) + 1)
 
-        a = self._ffnn_pre_dial(a_in, self.training)
-        a = self._transformer(a, tf.expand_dims(mask, axis=-1), self.training)
+        dialogue = self._tf_layers["ffnn.dial"](dialogue_in, self.training)
+        dialogue_transformed = self._tf_layers["transformer"](
+            dialogue, tf.expand_dims(mask, axis=-1), self.training
+        )
 
         if self.max_history_tracker_featurizer_used:
             # pick last label if max history featurizer is used
-            a = a[:, -1:, :]
+            dialogue_transformed = dialogue_transformed[:, -1:, :]
             mask = mask[:, -1:]
 
-        dial_embed = self._embed_dial(a)
+        dialogue_embed = self._tf_layers["embed.dial"](dialogue_transformed)
 
-        return dial_embed, mask
+        return dialogue_embed, mask
 
-    def _create_tf_bot_embed(self, b_in: tf.Tensor):
-        b = self._ffnn_bot(b_in, self.training)
-        return self._embed_bot(b)
+    def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
+        label = self._tf_layers["ffnn.bot"](label_in, self.training)
+        return self._tf_layers["embed.bot"](label)
 
     def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> tf.Tensor:
-        a_in, b_in, _ = batch_in
+        dialogue_in, label_in, _ = batch_in
 
         if self.max_history_tracker_featurizer_used:
             # add time dimension if max history featurizer is used
-            b_in = b_in[:, tf.newaxis, :]
+            label_in = label_in[:, tf.newaxis, :]
 
-        all_bot_raw = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
+        all_label = tf.constant(
+            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
         )
 
-        dial_embed, mask = self._create_tf_dial(a_in)
+        dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
-        bot_embed = self._create_tf_bot_embed(b_in)
-        self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
+        label_embed = self._embed_label(label_in)
+        self.all_label_embed = self._embed_label(all_label)
 
-        loss, acc = self._loss_label(
-            dial_embed, bot_embed, b_in, self.all_bot_embed, all_bot_raw, mask
+        loss, acc = self._tf_layers["loss.label"](
+            dialogue_embed, label_embed, label_in, self.all_label_embed, all_label, mask
         )
 
         self.metric_loss.update_state(loss)
@@ -579,21 +586,21 @@ def batch_loss(
         return loss
 
     def build_for_predict(self) -> None:
-        all_bot_raw = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw"
+        all_label_raw = tf.constant(
+            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
         )
-        self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw)
+        self.all_label_embed = self._embed_label(all_label_raw)
 
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> tf.Tensor:
-        a_in, b_in, _ = batch_in
+        dialogue_in, label_in, _ = batch_in
 
-        dial_embed, mask = self._create_tf_dial(a_in)
+        dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
-        sim_all = self._loss_label.sim(
-            dial_embed[:, :, tf.newaxis, :],
-            self.all_bot_embed[tf.newaxis, tf.newaxis, :, :],
+        sim_all = self._tf_layers["loss.label"].sim(
+            dialogue_embed[:, :, tf.newaxis, :],
+            self.all_label_embed[tf.newaxis, tf.newaxis, :, :],
             mask,
         )
 

From 3dce0855052801f3cc1da929d8fd3de2cf640ee7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 15:48:20 +0100
Subject: [PATCH 181/633] naming

---
 rasa/core/policies/embedding_policy.py | 31 ++++++++++++++------------
 rasa/utils/tensorflow/tf_models.py     |  2 +-
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index ae405f180e9d..8ed66fb75b20 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -89,7 +89,7 @@ class EmbeddingPolicy(Policy):
         # how to create batches
         BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
-        EPOCHS: 10,
+        EPOCHS: 1,
         # set random seed to any int to get reproducible results
         RANDOM_SEED: None,
         # embedding parameters
@@ -495,17 +495,17 @@ def _prepare_layers(self) -> None:
             self.config[C_EMB],
             self.config[SCALE_LOSS],
         )
-        self._tf_layers["ffnn.dial"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.dialogue"] = tf_layers.ReluFfn(
             self.config[HIDDEN_LAYERS_SIZES_PRE_DIAL],
             self.config[DROPRATE_DIAL],
             self.config[C2],
-            layer_name_suffix="pre_dial",
+            layer_name_suffix="dialogue",
         )
-        self._tf_layers["ffnn.bot"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.label"] = tf_layers.ReluFfn(
             self.config[HIDDEN_LAYERS_SIZES_BOT],
             self.config[DROPRATE_BOT],
             self.config[C2],
-            layer_name_suffix="bot",
+            layer_name_suffix="label",
         )
         self._tf_layers["transformer"] = tf_layers.TransformerEncoder(
             self.config[NUM_TRANSFORMER_LAYERS],
@@ -515,16 +515,19 @@ def _prepare_layers(self) -> None:
             self.config[MAX_SEQ_LENGTH],
             self.config[C2],
             self.config[DROPRATE_DIAL],
-            name="dial_encoder",
+            name="dialogue_encoder",
         )
-        self._tf_layers["embed.dial"] = tf_layers.Embed(
+        self._tf_layers["embed.dialogue"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "dial",
+            "dialogue",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.bot"] = tf_layers.Embed(
-            self.config[EMBED_DIM], self.config[C2], "bot", self.config[SIMILARITY_TYPE]
+        self._tf_layers["embed.label"] = tf_layers.Embed(
+            self.config[EMBED_DIM],
+            self.config[C2],
+            "label",
+            self.config[SIMILARITY_TYPE],
         )
 
     def set_training_phase(self, training: bool) -> None:
@@ -540,7 +543,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
         # if there is at least one `-1` it should be masked
         mask = tf.sign(tf.reduce_max(dialogue_in, -1) + 1)
 
-        dialogue = self._tf_layers["ffnn.dial"](dialogue_in, self.training)
+        dialogue = self._tf_layers["ffnn.dialogue"](dialogue_in, self.training)
         dialogue_transformed = self._tf_layers["transformer"](
             dialogue, tf.expand_dims(mask, axis=-1), self.training
         )
@@ -550,13 +553,13 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
             dialogue_transformed = dialogue_transformed[:, -1:, :]
             mask = mask[:, -1:]
 
-        dialogue_embed = self._tf_layers["embed.dial"](dialogue_transformed)
+        dialogue_embed = self._tf_layers["embed.dialogue"](dialogue_transformed)
 
         return dialogue_embed, mask
 
     def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
-        label = self._tf_layers["ffnn.bot"](label_in, self.training)
-        return self._tf_layers["embed.bot"](label)
+        label = self._tf_layers["ffnn.label"](label_in, self.training)
+        return self._tf_layers["embed.label"](label)
 
     def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index b8e25e935efd..ed9ad4f2aa01 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -39,7 +39,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = True,
+        eager: bool = False,
         random_seed: Optional[int] = None,
         **kwargs,
     ) -> None:

From 91ce2c9e14f231cfdc33de8824ed95a1f2b16c9c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 23 Jan 2020 15:49:15 +0100
Subject: [PATCH 182/633] refactor RasaModel and nlu

---
 .../embedding_intent_classifier.py            | 142 ++++++++----------
 rasa/utils/tensorflow/tf_layers.py            |   9 ++
 rasa/utils/tensorflow/tf_models.py            |   9 ++
 rasa/utils/train_utils.py                     |   9 --
 4 files changed, 81 insertions(+), 88 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index e3540b01cec1..c9d68b3e0ec5 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -870,9 +870,6 @@ def __init__(
         # tf objects
         self._prepare_layers()
 
-        # tf tensors
-        self.training = tf.ones((), tf.bool)
-
         # tf training
         self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
         self._create_metrics()
@@ -908,6 +905,28 @@ def _prepare_layers(self) -> None:
         self._prepare_intent_classification_layers()
         self._prepare_entity_recognition_layers()
 
+    @staticmethod
+    def _create_sparse_dense_layer(
+        data_signature: List[FeatureSignature],
+        name: Text,
+        reg_lambda: float,
+        dense_dim: int,
+    ) -> Optional[tf_layers.DenseForSparse]:
+
+        sparse = False
+        for is_sparse, shape in data_signature:
+            if is_sparse:
+                sparse = is_sparse
+            else:
+                # if dense features are present
+                # use the feature dimension of the dense features
+                dense_dim = shape[-1]
+
+        if sparse:
+            return tf_layers.DenseForSparse(
+                units=dense_dim, reg_lambda=reg_lambda, name=name
+            )
+
     def _prepare_sequence_layers(self) -> None:
         self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
@@ -1009,11 +1028,10 @@ def _prepare_entity_recognition_layers(self) -> None:
             average="micro",
         )
 
-    def set_training_phase(self, training: bool) -> None:
-        if training:
-            self.training = tf.ones((), tf.bool)
-        else:
-            self.training = tf.zeros((), tf.bool)
+    @staticmethod
+    def _get_mask_and_lengths(mask):
+        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
+        return mask, sequence_lengths
 
     def _combine_sparse_dense_features(
         self,
@@ -1028,7 +1046,7 @@ def _combine_sparse_dense_features(
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 if sparse_dropout:
-                    _f = self._tf_layers["sparse_dropout"](f, self.training)
+                    _f = self._tf_layers["sparse_dropout"](f, self._training)
                 else:
                     _f = f
 
@@ -1043,10 +1061,11 @@ def _create_bow(
         features: List[Union[tf.Tensor, "tf.SparseTensor"]],
         mask: tf.Tensor,
         name: Text,
+        sparse_dropout: bool = False,
     ) -> tf.Tensor:
 
-        x = self._combine_sparse_dense_features(features, mask, name)
-        return self._tf_layers[f"ffnn.{name}"](tf.reduce_sum(x, 1), self.training)
+        x = self._combine_sparse_dense_features(features, mask, name, sparse_dropout)
+        return self._tf_layers[f"ffnn.{name}"](tf.reduce_sum(x, 1), self._training)
 
     def _create_sequence(
         self,
@@ -1060,15 +1079,33 @@ def _create_sequence(
         )
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self.training)
+            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self._training)
         else:
             pre, lm_mask_bool = (x, None)
 
-        transformed = self._tf_layers["transformer"](pre, 1 - mask, self.training)
+        transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
         transformed = tf.nn.relu(transformed)
 
         return transformed, x, lm_mask_bool
 
+    @staticmethod
+    def _last_token(x, sequence_lengths):
+        last_index = tf.maximum(
+            tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
+        )
+        idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
+        return tf.gather_nd(x, idxs)
+
+    def _build_all_b(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_labels = self._create_bow(
+            self.tf_label_data["label_features"],
+            self.tf_label_data["label_mask"][0],
+            "label",
+        )
+        all_labels_embed = self._tf_layers["embed.label"](all_labels)
+
+        return all_labels_embed, all_labels
+
     def _mask_loss(
         self, a_transformed: tf.Tensor, a: tf.Tensor, lm_mask_bool: tf.Tensor
     ) -> tf.Tensor:
@@ -1090,16 +1127,6 @@ def _mask_loss(
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
 
-    def _build_all_b(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_labels = self._create_bow(
-            self.tf_label_data["label_features"],
-            self.tf_label_data["label_mask"][0],
-            "label",
-        )
-        all_labels_embed = self._tf_layers["embed.label"](all_labels)
-
-        return all_labels_embed, all_labels
-
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels_embed, all_labels = self._build_all_b()
 
@@ -1141,9 +1168,9 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
-
+        mask_text, sequence_lengths = self._get_mask_and_lengths(
+            tf_batch_data["text_mask"][0]
+        )
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
         )
@@ -1158,11 +1185,7 @@ def batch_loss(
 
         if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
-            last_index = tf.maximum(
-                tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
-            )
-            idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
-            cls = tf.gather_nd(text_transformed, idxs)
+            cls = self._last_token(text_transformed, sequence_lengths)
 
             label = self._create_bow(
                 tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
@@ -1195,9 +1218,9 @@ def predict(
     ) -> Dict[Text, tf.Tensor]:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths = tf.cast(tf.reduce_sum(mask_text[:, :, 0], 1), tf.int32)
-
+        mask_text, sequence_lengths = self._get_mask_and_lengths(
+            tf_batch_data["text_mask"][0]
+        )
         text_transformed, _, _ = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text"
         )
@@ -1205,59 +1228,20 @@ def predict(
         out = {}
         if self.config[INTENT_CLASSIFICATION]:
             # get _cls_ vector for intent classification
-            last_index = tf.maximum(
-                tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
-            )
-            idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
-            cls = tf.gather_nd(text_transformed, idxs)
-            cls_embed = self._embed["text"](cls)
+            cls = self._last_token(text_transformed, sequence_lengths)
+            cls_embed = self._tf_layers["embed.text"](cls)
 
-            sim_all = self._loss_label.sim(
+            sim_all = self._tf_layers["loss.label"].sim(
                 cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
             )
-
-            scores = train_utils.confidence_from_sim(
+            scores = self._tf_layers["loss.label"].confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
             )
             out["i_scores"] = scores
 
         if self.config[ENTITY_RECOGNITION]:
-            logits = self._embed["logits"](text_transformed)
-            pred_ids = self._crf(logits, sequence_lengths - 1)
+            logits = self._tf_layers["embed.logits"](text_transformed)
+            pred_ids = self._tf_layers["crf"](logits, sequence_lengths - 1)
             out["e_ids"] = pred_ids
 
         return out
-
-    @staticmethod
-    def _create_sparse_dense_layer(
-        data_signature: List[FeatureSignature],
-        name: Text,
-        reg_lambda: float,
-        dense_dim: int,
-    ) -> Optional[tf_layers.DenseForSparse]:
-
-        sparse = False
-        for is_sparse, shape in data_signature:
-            if is_sparse:
-                sparse = is_sparse
-            else:
-                # if dense features are present
-                # use the feature dimension of the dense features
-                dense_dim = shape[-1]
-
-        if sparse:
-            return tf_layers.DenseForSparse(
-                units=dense_dim, reg_lambda=reg_lambda, name=name
-            )
-
-    @staticmethod
-    def _input_dim(data_signature: List[FeatureSignature], dense_dim: int) -> int:
-
-        for is_sparse, shape in data_signature:
-            if not is_sparse:
-                # if dense features are present
-                # use the feature dimension of the dense features
-                dense_dim = shape[-1]
-                break
-
-        return dense_dim * len(data_signature)
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index d10fde22e931..d60cc8fcc0e5 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -608,6 +608,15 @@ def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tens
 
         return sim
 
+    @staticmethod
+    def confidence_from_sim(sim: tf.Tensor, similarity_type: Text) -> tf.Tensor:
+        if similarity_type == "cosine":
+            # clip negative values to zero
+            return tf.nn.relu(sim)
+        else:
+            # normalize result to [0, 1] with softmax
+            return tf.nn.softmax(sim)
+
     def _train_sim(
         self,
         pos_inputs_embed: tf.Tensor,
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index bb108405b0a1..aca09ffee5aa 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -20,6 +20,9 @@ def __init__(self, *args, **kwargs):
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
         self.metrics_to_log = ["t_loss"]
 
+        self._training = tf.ones((), tf.bool)
+        self._optimizer = None
+
     def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> tf.Tensor:
@@ -30,6 +33,12 @@ def predict(
     ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
+    def set_training_phase(self, training: bool) -> None:
+        if training:
+            self._training = tf.ones((), tf.bool)
+        else:
+            self._training = tf.zeros((), tf.bool)
+
     def fit(
         self,
         model_data: RasaModelData,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index ecc7dd563510..696588ae9d34 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -21,15 +21,6 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto
         return None
 
 
-def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
-    if similarity_type == "cosine":
-        # clip negative values to zero
-        return tf.nn.relu(sim)
-    else:
-        # normalize result to [0, 1] with softmax
-        return tf.nn.softmax(sim)
-
-
 def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
     """Extract attention probabilities from t2t dict"""
 

From 14a9e8c97eaef375309659bff979b7b9cd12114d Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Thu, 23 Jan 2020 16:07:51 +0100
Subject: [PATCH 183/633] Update
 rasa/nlu/classifiers/embedding_intent_classifier.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index c9d68b3e0ec5..81f62349d0b6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1089,7 +1089,7 @@ def _create_sequence(
         return transformed, x, lm_mask_bool
 
     @staticmethod
-    def _last_token(x, sequence_lengths):
+    def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
         last_index = tf.maximum(
             tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
         )

From 10358369c6a0d0fb170b5e8c13e6085cdcbe0c8b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 23 Jan 2020 16:19:13 +0100
Subject: [PATCH 184/633] add types

---
 .../nlu/classifiers/embedding_intent_classifier.py | 12 +++++-------
 rasa/utils/tensorflow/tf_model_data.py             |  6 +++---
 rasa/utils/tensorflow/tf_models.py                 | 14 +++++++-------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 81f62349d0b6..2f9dc6d49bc1 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1029,7 +1029,7 @@ def _prepare_entity_recognition_layers(self) -> None:
         )
 
     @staticmethod
-    def _get_mask_and_lengths(mask):
+    def _get_mask_and_lengths(mask: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
         return mask, sequence_lengths
 
@@ -1090,13 +1090,11 @@ def _create_sequence(
 
     @staticmethod
     def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
-        last_index = tf.maximum(
-            tf.constant(0, dtype=sequence_lengths.dtype), sequence_lengths - 1
-        )
+        last_index = tf.maximum(0, sequence_lengths - 1)
         idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
         return tf.gather_nd(x, idxs)
 
-    def _build_all_b(self) -> Tuple[tf.Tensor, tf.Tensor]:
+    def _build_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_labels = self._create_bow(
             self.tf_label_data["label_features"],
             self.tf_label_data["label_mask"][0],
@@ -1128,7 +1126,7 @@ def _mask_loss(
         )
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
-        all_labels_embed, all_labels = self._build_all_b()
+        all_labels_embed, all_labels = self._build_all_labels()
 
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
@@ -1210,7 +1208,7 @@ def batch_loss(
     def build_for_predict(self, model_data: RasaModelData) -> None:
         self.batch_tuple_sizes = model_data.batch_tuple_sizes()
 
-        all_labels_embed, _ = self._build_all_b()
+        all_labels_embed, _ = self._build_all_labels()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index 9ad0ff0eb413..b9072be1bc06 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -282,10 +282,10 @@ def batch_tuple_sizes(self) -> Dict[Text, int]:
 
     def as_tf_dataset(
         self,
-        batch_size: Union["tf.Tensor", int],
+        batch_size: Union[tf.Tensor, int],
         batch_strategy: Text = "sequence",
         shuffle: bool = False,
-    ) -> "tf.data.Dataset":
+    ) -> tf.data.Dataset:
         """Create tf dataset."""
 
         shapes, types = self._get_shapes_types()
@@ -383,7 +383,7 @@ def _data_for_ids(data: Data, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
         return new_data
 
     def _split_by_label_ids(
-        self, data: Data, label_ids: "np.ndarray", unique_label_ids: "np.ndarray"
+        self, data: Data, label_ids: np.ndarray, unique_label_ids: np.ndarray
     ) -> List["RasaModelData"]:
         """Reorganize session data into a list of session data with the same labels."""
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index aca09ffee5aa..e813eb3b1e14 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -139,7 +139,7 @@ def _total_batch_loss(
     def _batch_loop(
         self,
         dataset_function: Callable,
-        method_function: Callable,
+        call_model_function: Callable,
         batch_size: Union[tf.Tensor, int],
         training: bool,
     ) -> None:
@@ -148,19 +148,19 @@ def _batch_loop(
         self.reset_metrics()
         self.set_training_phase(training)
         for batch_in in dataset_function(batch_size):
-            method_function(batch_in)
+            call_model_function(batch_in)
 
     @staticmethod
     def _get_tf_functions(
         dataset_function: Callable,
-        method_function: Callable,
+        call_model_function: Callable,
         eager: bool,
         method: Text,
     ) -> Tuple[Callable, Callable]:
         """Convert functions to tensorflow functions"""
 
         if eager:
-            return dataset_function, method_function
+            return dataset_function, call_model_function
 
         logger.debug(f"Building tensorflow {method} graph...")
         # allows increasing batch size
@@ -169,7 +169,7 @@ def _get_tf_functions(
         init_dataset = tf_dataset_function(tf.ones((), tf.int32))
 
         tf_method_function = tf.function(
-            method_function, input_signature=[init_dataset.element_spec]
+            call_model_function, input_signature=[init_dataset.element_spec]
         )
         tf_method_function(next(iter(init_dataset)))
 
@@ -182,7 +182,7 @@ def _get_tf_train_functions(
     ) -> Tuple[Callable, Callable]:
         """Create train tensorflow functions"""
 
-        def train_dataset_function(_batch_size):
+        def train_dataset_function(_batch_size: tf.Tensor) -> tf.data.Dataset:
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
         return self._get_tf_functions(
@@ -199,7 +199,7 @@ def _get_tf_evaluation_functions(
 
         if evaluate_on_num_examples > 0:
 
-            def evaluation_dataset_function(_batch_size):
+            def evaluation_dataset_function(_batch_size: tf.Tensor) -> tf.data.Dataset:
                 return evaluation_model_data.as_tf_dataset(
                     _batch_size, "sequence", shuffle=False
                 )

From 3e12f80732353eecab30970743cbcb4eeb38c052 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 16:53:50 +0100
Subject: [PATCH 185/633] Update KerasPolicy

---
 rasa/core/policies/keras_policy.py | 118 ++++++++++++-----------------
 1 file changed, 50 insertions(+), 68 deletions(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 2ce9a1dfd32a..f53ce6b81681 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -5,7 +5,7 @@
 import tensorflow as tf
 import numpy as np
 import warnings
-from typing import Any, List, Dict, Text, Optional, Tuple
+from typing import Any, List, Dict, Text, Optional, Tuple, Union
 
 import rasa.utils.io
 
@@ -18,11 +18,13 @@
 from rasa.core.policies.policy import Policy
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils.common import obtain_verbosity
+from rasa.utils.tensorflow import tf_models
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 
 # there are a number of issues with imports from tensorflow. hence the deactivation
 # pytype: disable=import-error
 # pytype: disable=module-attr
+
 try:
     import cPickle as pickle
 except ImportError:
@@ -56,8 +58,6 @@ def __init__(
         featurizer: Optional[TrackerFeaturizer] = None,
         priority: int = DEFAULT_POLICY_PRIORITY,
         model: Optional[tf.keras.models.Sequential] = None,
-        graph: Optional[tf.Graph] = None,
-        session: Optional[tf.compat.v1.Session] = None,
         current_epoch: int = 0,
         max_history: Optional[int] = None,
         **kwargs: Any,
@@ -68,10 +68,6 @@ def __init__(
 
         self._load_params(**kwargs)
         self.model = model
-        # by default keras uses default tf graph and global tf session
-        # we are going to either load them or create them in train(...)
-        self.graph = graph
-        self.session = session
 
         self.current_epoch = current_epoch
 
@@ -178,41 +174,36 @@ def train(
         # noinspection PyPep8Naming
         shuffled_X, shuffled_y = training_data.shuffled_X_y()
 
-        self.graph = tf.Graph()
-        with self.graph.as_default():
-            # set random seed in tf
-            tf.set_random_seed(self.random_seed)
-            self.session = tf.compat.v1.Session(config=self._tf_config)
-
-            with self.session.as_default():
-                if self.model is None:
-                    self.model = self.model_architecture(
-                        shuffled_X.shape[1:], shuffled_y.shape[1:]
-                    )
-
-                logger.info(
-                    "Fitting model with {} total samples and a "
-                    "validation split of {}"
-                    "".format(training_data.num_examples(), self.validation_split)
-                )
+        tf.random.set_seed(self.random_seed)
 
-                # filter out kwargs that cannot be passed to fit
-                self._train_params = self._get_valid_params(
-                    self.model.fit, **self._train_params
-                )
+        if self.model is None:
+            self.model = self.model_architecture(
+                shuffled_X.shape[1:], shuffled_y.shape[1:]
+            )
 
-                self.model.fit(
-                    shuffled_X,
-                    shuffled_y,
-                    epochs=self.epochs,
-                    batch_size=self.batch_size,
-                    shuffle=False,
-                    verbose=obtain_verbosity(),
-                    **self._train_params,
-                )
-                # the default parameter for epochs in keras fit is 1
-                self.current_epoch = self.defaults.get("epochs", 1)
-                logger.info("Done fitting keras policy model")
+        logger.info(
+            "Fitting model with {} total samples and a "
+            "validation split of {}"
+            "".format(training_data.num_examples(), self.validation_split)
+        )
+
+        # filter out kwargs that cannot be passed to fit
+        self._train_params = self._get_valid_params(
+            self.model.fit, **self._train_params
+        )
+
+        self.model.fit(
+            shuffled_X,
+            shuffled_y,
+            epochs=self.epochs,
+            batch_size=self.batch_size,
+            shuffle=False,
+            verbose=obtain_verbosity(),
+            **self._train_params,
+        )
+        # the default parameter for epochs in keras fit is 1
+        self.current_epoch = self.defaults.get("epochs", 1)
+        logger.info("Done fitting keras policy model")
 
     def continue_training(
         self,
@@ -231,23 +222,22 @@ def continue_training(
         batch_size = kwargs.get("batch_size", 5)
         epochs = kwargs.get("epochs", 50)
 
-        with self.graph.as_default(), self.session.as_default():
-            for _ in range(epochs):
-                training_data = self._training_data_for_continue_training(
-                    batch_size, training_trackers, domain
-                )
+        for _ in range(epochs):
+            training_data = self._training_data_for_continue_training(
+                batch_size, training_trackers, domain
+            )
 
-                # fit to one extra example using updated trackers
-                self.model.fit(
-                    training_data.X,
-                    training_data.y,
-                    epochs=self.current_epoch + 1,
-                    batch_size=len(training_data.y),
-                    verbose=obtain_verbosity(),
-                    initial_epoch=self.current_epoch,
-                )
+            # fit to one extra example using updated trackers
+            self.model.fit(
+                training_data.X,
+                training_data.y,
+                epochs=self.current_epoch + 1,
+                batch_size=len(training_data.y),
+                verbose=obtain_verbosity(),
+                initial_epoch=self.current_epoch,
+            )
 
-                self.current_epoch += 1
+            self.current_epoch += 1
 
     def predict_action_probabilities(
         self, tracker: DialogueStateTracker, domain: Domain
@@ -256,8 +246,7 @@ def predict_action_probabilities(
         # noinspection PyPep8Naming
         X = self.featurizer.create_X([tracker], domain)
 
-        with self.graph.as_default(), self.session.as_default():
-            y_pred = self.model.predict(X, batch_size=1)
+        y_pred = self.model.predict(X, batch_size=1)
 
         if len(y_pred.shape) == 2:
             return y_pred[-1].tolist()
@@ -283,8 +272,7 @@ def persist(self, path: Text) -> None:
             model_file = os.path.join(path, meta["model"])
             # makes sure the model directory exists
             rasa.utils.io.create_directory_for_file(model_file)
-            with self.graph.as_default(), self.session.as_default():
-                self.model.save(model_file, overwrite=True)
+            self.model.save(model_file, overwrite=True)
 
             tf_config_file = os.path.join(path, "keras_policy.tf_config.pkl")
             with open(tf_config_file, "wb") as f:
@@ -312,20 +300,14 @@ def load(cls, path: Text) -> "KerasPolicy":
 
                 model_file = os.path.join(path, meta["model"])
 
-                graph = tf.Graph()
-                with graph.as_default():
-                    session = tf.compat.v1.Session(config=_tf_config)
-                    with session.as_default():
-                        with warnings.catch_warnings():
-                            warnings.simplefilter("ignore")
-                            model = load_model(model_file)
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")
+                    model = load_model(model_file)
 
                 return cls(
                     featurizer=featurizer,
                     priority=meta["priority"],
                     model=model,
-                    graph=graph,
-                    session=session,
                     current_epoch=meta["epochs"],
                 )
             else:

From 002989f93b85e31ff2ce235c13b393005c035c43 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 23 Jan 2020 17:04:36 +0100
Subject: [PATCH 186/633] clean up train_utils

---
 rasa/utils/train_utils.py | 52 +--------------------------------------
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index ecc7dd563510..05a4588ff54d 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,17 +1,11 @@
 import logging
-from typing import List, Optional, Text, Dict, Tuple, Union, Any, NamedTuple
+from typing import Optional, Text, Dict, Any
 import tensorflow as tf
 
 
 logger = logging.getLogger(__name__)
 
 
-# namedtuple for training metrics
-class TrainingMetrics(NamedTuple):
-    loss: Dict[Text, Union[tf.Tensor, float]]
-    score: Dict[Text, Union[tf.Tensor, float]]
-
-
 def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]:
     """Prepare `tf.compat.v1.ConfigProto` for training"""
 
@@ -28,47 +22,3 @@ def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
     else:
         # normalize result to [0, 1] with softmax
         return tf.nn.softmax(sim)
-
-
-def extract_attention(attention_weights) -> Optional["tf.Tensor"]:
-    """Extract attention probabilities from t2t dict"""
-
-    attention = [
-        tf.expand_dims(t, 0)
-        for name, t in attention_weights.items()
-        # the strings come from t2t library
-        if "multihead_attention/dot_product" in name and not name.endswith("/logits")
-    ]
-
-    if attention:
-        return tf.concat(attention, 0)
-
-
-def persist_tensor(
-    name: Text,
-    tensor: Union["tf.Tensor", Tuple["tf.Tensor"], List["tf.Tensor"]],
-    graph: "tf.Graph",
-) -> None:
-    """Add tensor to collection if it is not None"""
-
-    if tensor is not None:
-        graph.clear_collection(name)
-        if isinstance(tensor, tuple) or isinstance(tensor, list):
-            for t in tensor:
-                graph.add_to_collection(name, t)
-        else:
-            graph.add_to_collection(name, tensor)
-
-
-def load_tensor(name: Text) -> Optional[Union["tf.Tensor", List["tf.Tensor"]]]:
-    """Load tensor or set it to None"""
-
-    tensor_list = tf.get_collection(name)
-
-    if not tensor_list:
-        return None
-
-    if len(tensor_list) == 1:
-        return tensor_list[0]
-
-    return tensor_list

From 6c1ccc74dcb285f9f0b1ba22bde73da7bebaf0f9 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 10:36:28 +0100
Subject: [PATCH 187/633] create RasaModel.save/load

---
 .../embedding_intent_classifier.py            | 34 +++++++------------
 rasa/utils/tensorflow/tf_models.py            | 31 +++++++++++++++--
 2 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 2f9dc6d49bc1..63420f2b9c3c 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -727,7 +727,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             if e.errno != errno.EEXIST:
                 raise
 
-        self.model.save_weights(tf_model_file, save_format="tf")
+        self.model.save(tf_model_file)
 
         with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "wb") as f:
             pickle.dump(self.data_example, f)
@@ -804,30 +804,25 @@ def load(
             elif meta[LOSS_TYPE] == "margin":
                 meta[SIMILARITY_TYPE] = "cosine"
 
-        model = DIET(model_data_example.get_signature(), label_data, inv_tag_dict, meta)
-
         logger.debug("Loading the model ...")
-        model.fit(
+        model = DIET.load(
+            tf_model_file,
             model_data_example,
-            1,
-            1,
-            0,
-            0,
-            batch_strategy=meta[BATCH_STRATEGY],
-            silent=True,  # don't confuse users with training output
-            eager=True,  # no need to build tf graph, eager is faster here
+            model_data_example.get_signature(),
+            label_data,
+            inv_tag_dict,
+            meta,
         )
-        model.load_weights(tf_model_file)
-
         # build the graph for prediction
         model.set_training_phase(False)
-        model_data = RasaModelData(
+        predict_data = RasaModelData(
             label_key="label_ids",
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
-        model.data_signature = model_data.get_signature()
-        model.build_for_predict(model_data)
-        predict_dataset = model_data.as_tf_dataset(
+        # override train signature with predict signature
+        model.data_signature = predict_data.get_signature()
+        model.build_for_predict()
+        predict_dataset = predict_data.as_tf_dataset(
             1, batch_strategy="sequence", shuffle=False
         )
         predict_func = tf.function(
@@ -877,7 +872,6 @@ def __init__(
 
         # persist
         self.all_labels_embed = None
-        self.batch_tuple_sizes = None
 
     def _create_metrics(self):
         # self.metrics preserve order
@@ -1205,9 +1199,7 @@ def batch_loss(
 
         return tf.math.add_n(losses)
 
-    def build_for_predict(self, model_data: RasaModelData) -> None:
-        self.batch_tuple_sizes = model_data.batch_tuple_sizes()
-
+    def build_for_predict(self) -> None:
         all_labels_embed, _ = self._build_all_labels()
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index e813eb3b1e14..09068f05e60e 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -124,6 +124,29 @@ def train_on_batch(
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
+    def save(self, model_file_name) -> None:
+        self.save_weights(model_file_name, save_format="tf")
+
+    @classmethod
+    def load(cls, model_file_name, model_data_example, *args, **kwargs):
+        # create empty model
+        model = cls(*args, **kwargs)
+        # need to train on 1 example to build weights of the correct size
+        model.fit(
+            model_data_example,
+            1,
+            1,
+            0,
+            0,
+            batch_strategy="sequence",
+            silent=True,  # don't confuse users with training output
+            eager=True,  # no need to build tf graph, eager is faster here
+        )
+        # load trained weights
+        model.load_weights(model_file_name)
+
+        return model
+
     def _total_batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> tf.Tensor:
@@ -182,7 +205,9 @@ def _get_tf_train_functions(
     ) -> Tuple[Callable, Callable]:
         """Create train tensorflow functions"""
 
-        def train_dataset_function(_batch_size: tf.Tensor) -> tf.data.Dataset:
+        def train_dataset_function(
+            _batch_size: Union[tf.Tensor, int]
+        ) -> tf.data.Dataset:
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
         return self._get_tf_functions(
@@ -199,7 +224,9 @@ def _get_tf_evaluation_functions(
 
         if evaluate_on_num_examples > 0:
 
-            def evaluation_dataset_function(_batch_size: tf.Tensor) -> tf.data.Dataset:
+            def evaluation_dataset_function(
+                _batch_size: Union[tf.Tensor, int]
+            ) -> tf.data.Dataset:
                 return evaluation_model_data.as_tf_dataset(
                     _batch_size, "sequence", shuffle=False
                 )

From 0e689245f862358ad8318cc55f5de553b6afd70e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 10:46:36 +0100
Subject: [PATCH 188/633] add types

---
 .../embedding_intent_classifier.py            |  6 ++--
 rasa/utils/tensorflow/tf_models.py            | 30 +++++++++++--------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 63420f2b9c3c..1865734321f2 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -815,14 +815,14 @@ def load(
         )
         # build the graph for prediction
         model.set_training_phase(False)
-        predict_data = RasaModelData(
+        predict_data_example = RasaModelData(
             label_key="label_ids",
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
         # override train signature with predict signature
-        model.data_signature = predict_data.get_signature()
+        model.data_signature = predict_data_example.get_signature()
         model.build_for_predict()
-        predict_dataset = predict_data.as_tf_dataset(
+        predict_dataset = predict_data_example.as_tf_dataset(
             1, batch_strategy="sequence", shuffle=False
         )
         predict_func = tf.function(
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 09068f05e60e..fc2af632d462 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -12,7 +12,10 @@
 
 # noinspection PyMethodOverriding
 class RasaModel(tf.keras.models.Model):
-    """Completely override all public methods of keras Model."""
+    """Completely override all public methods of keras Model.
+
+    Cannot be used as tf.keras.Model
+    """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -29,7 +32,7 @@ def batch_loss(
         raise NotImplementedError
 
     def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
@@ -50,7 +53,6 @@ def fit(
         silent: bool = False,
         eager: bool = False,
         random_seed: Optional[int] = None,
-        **kwargs,
     ) -> None:
         """Fit model data"""
 
@@ -114,7 +116,7 @@ def fit(
             logger.info("Finished training.")
 
     def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> None:
         """Train on batch"""
 
@@ -124,11 +126,13 @@ def train_on_batch(
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-    def save(self, model_file_name) -> None:
+    def save(self, model_file_name: Text) -> None:
         self.save_weights(model_file_name, save_format="tf")
 
     @classmethod
-    def load(cls, model_file_name, model_data_example, *args, **kwargs):
+    def load(
+        cls, model_file_name: Text, model_data_example: RasaModelData, *args, **kwargs
+    ) -> "RasaModel":
         # create empty model
         model = cls(*args, **kwargs)
         # need to train on 1 example to build weights of the correct size
@@ -315,23 +319,23 @@ def linearly_increasing_batch_size(
         else:
             return int(batch_size[0])
 
-    def compile(self, **kwargs) -> None:
+    def compile(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def evaluate(self, **kwargs) -> None:
+    def evaluate(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def test_on_batch(self, **kwargs) -> None:
+    def test_on_batch(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def predict_on_batch(self, **kwargs) -> None:
+    def predict_on_batch(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def fit_generator(self, **kwargs) -> None:
+    def fit_generator(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def evaluate_generator(self, **kwargs) -> None:
+    def evaluate_generator(self, *args, **kwargs) -> None:
         raise NotImplemented
 
-    def predict_generator(self, **kwargs) -> None:
+    def predict_generator(self, *args, **kwargs) -> None:
         raise NotImplemented

From a0b48d41f1c571210b9ea5f217bc07925d085c20 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 11:33:18 +0100
Subject: [PATCH 189/633] testing works

---
 rasa/core/policies/embedding_policy.py | 59 +++++++++++++++-----------
 rasa/utils/train_utils.py              | 17 ++++++++
 2 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 8ed66fb75b20..b78033bf5752 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -53,6 +53,7 @@
 DROPRATE_BOT = "droprate_bot"
 EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
 EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
+RANKING_LENGTH = "ranking_length"
 
 
 class EmbeddingPolicy(Policy):
@@ -101,6 +102,9 @@ class EmbeddingPolicy(Policy):
         SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
         # the type of the loss function
         LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        # number of top actions to normalize scores for softmax loss_type
+        # set to 0 to turn off normalization
+        RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
         MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
@@ -223,17 +227,15 @@ def _create_model_data(
             label_ids = np.expand_dims(label_ids, -1)
         else:
             # prediction time
-            label_ids = None
-            Y = None
+            label_ids = np.array([])
+            Y = np.array([])
 
-        return RasaModelData(
-            label_key="action_ids",
-            data={
-                "dialogue_features": [data_X],
-                "bot_features": [Y],
-                "action_ids": [label_ids],
-            },
-        )
+        model_data = RasaModelData(label_key="action_ids")
+        model_data.add_features("dialogue_features", [data_X])
+        model_data.add_features("bot_features", [Y])
+        model_data.add_features("action_ids", [label_ids])
+
+        return model_data
 
     # training methods
     def train(
@@ -333,8 +335,12 @@ def predict_action_probabilities(
         batch_in = next(iter(predict_dataset))
 
         confidence = self.predict_func(batch_in)
+        confidence = confidence[0, -1, :].numpy()
 
-        return confidence[0, -1, :].tolist()
+        if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
+            confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH])
+
+        return list(confidence)
 
     def persist(self, path: Text):
         """Persists the policy to a storage."""
@@ -387,9 +393,6 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
 
         featurizer = TrackerFeaturizer.load(path)
 
-        if not os.path.exists(tf_model_file + ".meta"):
-            return cls(featurizer=featurizer)
-
         with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
             model_data_example = RasaModelData(
                 label_key="action_ids", data=pickle.load(f)
@@ -432,9 +435,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         model.set_training_phase(False)
         model_data = RasaModelData(
             label_key="action_ids",
-            data={k: vs for k, vs in model_data_example.items() if "text" in k},
+            data={k: vs for k, vs in model_data_example.items() if "dialogue" in k},
         )
-        model.data_signature = model_data.get_signature()
         model.build_for_predict()
         predict_dataset = model_data.as_tf_dataset(
             1, batch_strategy="sequence", shuffle=False
@@ -447,6 +449,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         logger.debug("Finished loading the model.")
 
         return cls(
+            featurizer=featurizer,
             component_config=meta,
             priority=meta["priority"],
             model=model,
@@ -465,7 +468,6 @@ def __init__(
 
         self.config = config
         self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
-        self._encoded_all_label_ids = encoded_all_label_ids
 
         # optimizer
         self._optimizer = tf.keras.optimizers.Adam()
@@ -473,8 +475,8 @@ def __init__(
         # tf tensors
         self.training = tf.ones((), tf.bool)
 
-        # persist
-        self.all_label_embed = None
+        self.all_labels_embed = None
+        self._encoded_all_label_ids = encoded_all_label_ids
 
         # metrics
         self.metric_loss = tf.keras.metrics.Mean(name="loss")
@@ -577,10 +579,15 @@ def batch_loss(
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
         label_embed = self._embed_label(label_in)
-        self.all_label_embed = self._embed_label(all_label)
+        self.all_labels_embed = self._embed_label(all_label)
 
         loss, acc = self._tf_layers["loss.label"](
-            dialogue_embed, label_embed, label_in, self.all_label_embed, all_label, mask
+            dialogue_embed,
+            label_embed,
+            label_in,
+            self.all_labels_embed,
+            all_label,
+            mask,
         )
 
         self.metric_loss.update_state(loss)
@@ -589,21 +596,23 @@ def batch_loss(
         return loss
 
     def build_for_predict(self) -> None:
-        all_label_raw = tf.constant(
+        all_label = tf.constant(
             self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
         )
-        self.all_label_embed = self._embed_label(all_label_raw)
+        all_labels_embed = self._embed_label(all_label)
+
+        self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
     def predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
     ) -> tf.Tensor:
-        dialogue_in, label_in, _ = batch_in
+        dialogue_in = batch_in[0]
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
         sim_all = self._tf_layers["loss.label"].sim(
             dialogue_embed[:, :, tf.newaxis, :],
-            self.all_label_embed[tf.newaxis, tf.newaxis, :, :],
+            self.all_labels_embed[tf.newaxis, tf.newaxis, :, :],
             mask,
         )
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 05a4588ff54d..9435f49ee936 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Optional, Text, Dict, Any
 import tensorflow as tf
+import numpy as np
 
 
 logger = logging.getLogger(__name__)
@@ -22,3 +23,19 @@ def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor":
     else:
         # normalize result to [0, 1] with softmax
         return tf.nn.softmax(sim)
+
+
+def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray:
+    """Normalizes an array of positive numbers over the top `ranking_length` values.
+    Other values will be set to 0.
+    """
+
+    new_values = values.copy()  # prevent mutation of the input
+    if 0 < ranking_length < len(new_values):
+        ranked = sorted(new_values, reverse=True)
+        new_values[new_values < ranked[ranking_length - 1]] = 0
+
+    if np.sum(new_values) > 0:
+        new_values = new_values / np.sum(new_values)
+
+    return new_values

From 269a74163d6f72b377470785813e9664237e5843 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 11:45:06 +0100
Subject: [PATCH 190/633] refactor RasaModel.predict

---
 .../embedding_intent_classifier.py            | 86 ++++++++-----------
 rasa/utils/tensorflow/tf_models.py            | 27 +++++-
 2 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 1865734321f2..240284d20b56 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -206,7 +206,6 @@ def __init__(
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
         model: Optional[tf_models.RasaModel] = None,
-        predict_func: Optional[Callable] = None,
         batch_tuple_sizes: Optional[Dict] = None,
         attention_weights: Optional[tf.Tensor] = None,
     ) -> None:
@@ -221,7 +220,6 @@ def __init__(
         self.inverted_tag_dict = inverted_tag_dict
 
         self.model = model
-        self.predict_func = predict_func
 
         # encode all label_ids with numbers
         self._label_data = None
@@ -563,10 +561,8 @@ def train(
         # set random seed
         tf.random.set_seed(self.component_config[RANDOM_SEED])
 
-        model_data_signature = model_data.get_signature()
-
         self.model = DIET(
-            model_data_signature,
+            model_data.get_signature(),
             self._label_data,
             self.inverted_tag_dict,
             self.component_config,
@@ -584,15 +580,13 @@ def train(
 
     # process helpers
     def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
-        if self.model is None or self.predict_func is None:
+        if self.model is None:
             return
 
         # create session data from message and convert it into a batch of 1
         model_data = self._create_model_data([message])
-        predict_dataset = model_data.as_tf_dataset(1)
-        batch_in = next(iter(predict_dataset))
 
-        return self.predict_func(batch_in)
+        return self.model.predict(model_data)
 
     def _predict_label(
         self, out: Dict[Text, tf.Tensor]
@@ -804,7 +798,6 @@ def load(
             elif meta[LOSS_TYPE] == "margin":
                 meta[SIMILARITY_TYPE] = "cosine"
 
-        logger.debug("Loading the model ...")
         model = DIET.load(
             tf_model_file,
             model_data_example,
@@ -814,30 +807,17 @@ def load(
             meta,
         )
         # build the graph for prediction
-        model.set_training_phase(False)
         predict_data_example = RasaModelData(
             label_key="label_ids",
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
-        # override train signature with predict signature
-        model.data_signature = predict_data_example.get_signature()
-        model.build_for_predict()
-        predict_dataset = predict_data_example.as_tf_dataset(
-            1, batch_strategy="sequence", shuffle=False
-        )
-        predict_func = tf.function(
-            func=model.predict, input_signature=[predict_dataset.element_spec]
-        )
-        batch_in = next(iter(predict_dataset))
-        predict_func(batch_in)
-        logger.debug("Finished loading the model.")
+        model.build_for_predict(predict_data_example)
 
         return cls(
             component_config=meta,
             inverted_label_dict=inv_label_dict,
             inverted_tag_dict=inv_tag_dict,
             model=model,
-            predict_func=predict_func,
             batch_tuple_sizes=batch_tuple_sizes,
         )
 
@@ -854,6 +834,10 @@ def __init__(
 
         # data
         self.data_signature = data_signature
+        self.predict_data_signature = {
+            k: vs for k, vs in data_signature.items() if "text" in k
+        }
+
         label_batch = label_data.prepare_batch()
         self.tf_label_data = self.batch_to_model_data_format(
             label_batch, label_data.get_signature()
@@ -870,7 +854,7 @@ def __init__(
         self._create_metrics()
         self._update_metrics_to_log()
 
-        # persist
+        # predict
         self.all_labels_embed = None
 
     def _create_metrics(self):
@@ -901,14 +885,14 @@ def _prepare_layers(self) -> None:
 
     @staticmethod
     def _create_sparse_dense_layer(
-        data_signature: List[FeatureSignature],
+        feature_signatures: List[FeatureSignature],
         name: Text,
         reg_lambda: float,
         dense_dim: int,
     ) -> Optional[tf_layers.DenseForSparse]:
 
         sparse = False
-        for is_sparse, shape in data_signature:
+        for is_sparse, shape in feature_signatures:
             if is_sparse:
                 sparse = is_sparse
             else:
@@ -1023,9 +1007,8 @@ def _prepare_entity_recognition_layers(self) -> None:
         )
 
     @staticmethod
-    def _get_mask_and_lengths(mask: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
-        sequence_lengths = tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
-        return mask, sequence_lengths
+    def _get_sequence_lengths(mask: tf.Tensor) -> tf.Tensor:
+        return tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
 
     def _combine_sparse_dense_features(
         self,
@@ -1082,13 +1065,7 @@ def _create_sequence(
 
         return transformed, x, lm_mask_bool
 
-    @staticmethod
-    def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
-        last_index = tf.maximum(0, sequence_lengths - 1)
-        idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
-        return tf.gather_nd(x, idxs)
-
-    def _build_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
+    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_labels = self._create_bow(
             self.tf_label_data["label_features"],
             self.tf_label_data["label_mask"][0],
@@ -1098,6 +1075,12 @@ def _build_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
 
         return all_labels_embed, all_labels
 
+    @staticmethod
+    def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
+        last_index = tf.maximum(0, sequence_lengths - 1)
+        idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
+        return tf.gather_nd(x, idxs)
+
     def _mask_loss(
         self, a_transformed: tf.Tensor, a: tf.Tensor, lm_mask_bool: tf.Tensor
     ) -> tf.Tensor:
@@ -1120,7 +1103,7 @@ def _mask_loss(
         )
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
-        all_labels_embed, all_labels = self._build_all_labels()
+        all_labels_embed, all_labels = self._create_all_labels()
 
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
@@ -1160,9 +1143,9 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        mask_text, sequence_lengths = self._get_mask_and_lengths(
-            tf_batch_data["text_mask"][0]
-        )
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths = self._get_sequence_lengths(mask_text)
+
         text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
         )
@@ -1199,24 +1182,25 @@ def batch_loss(
 
         return tf.math.add_n(losses)
 
-    def build_for_predict(self) -> None:
-        all_labels_embed, _ = self._build_all_labels()
-        self.all_labels_embed = tf.constant(all_labels_embed.numpy())
-
-    def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
+    def batch_predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
-        mask_text, sequence_lengths = self._get_mask_and_lengths(
-            tf_batch_data["text_mask"][0]
+        tf_batch_data = self.batch_to_model_data_format(
+            batch_in, self.predict_data_signature
         )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths = self._get_sequence_lengths(mask_text)
+
         text_transformed, _, _ = self._create_sequence(
             tf_batch_data["text_features"], mask_text, "text"
         )
 
         out = {}
         if self.config[INTENT_CLASSIFICATION]:
+            if self.all_labels_embed is None:
+                self.all_labels_embed, _ = self._create_all_labels()
+
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
             cls_embed = self._tf_layers["embed.text"](cls)
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index fc2af632d462..f2135cfe2279 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -26,12 +26,14 @@ def __init__(self, *args, **kwargs):
         self._training = tf.ones((), tf.bool)
         self._optimizer = None
 
+        self._predict_function = None
+
     def batch_loss(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> tf.Tensor:
         raise NotImplementedError
 
-    def predict(
+    def batch_predict(
         self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
     ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
@@ -126,6 +128,26 @@ def train_on_batch(
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
+    def build_for_predict(self, predict_data: RasaModelData, eager: bool = False):
+        def predict_dataset_function(  # to reuse the same helper method
+            _batch_size: Union[tf.Tensor, int]
+        ) -> tf.data.Dataset:
+            return predict_data.as_tf_dataset(_batch_size, "sequence", shuffle=False)
+
+        _, self._predict_function = self._get_tf_functions(
+            predict_dataset_function, self.batch_predict, eager, "prediction"
+        )
+
+    def predict(self, predict_data):
+        if self._predict_function is None:
+            logger.debug("There is no tensorflow prediction graph")
+            self.build_for_predict(predict_data)
+
+        predict_dataset = predict_data.as_tf_dataset(1)
+        batch_in = next(iter(predict_dataset))
+        self.set_training_phase(False)
+        return self._predict_function(batch_in)
+
     def save(self, model_file_name: Text) -> None:
         self.save_weights(model_file_name, save_format="tf")
 
@@ -133,6 +155,7 @@ def save(self, model_file_name: Text) -> None:
     def load(
         cls, model_file_name: Text, model_data_example: RasaModelData, *args, **kwargs
     ) -> "RasaModel":
+        logger.debug("Loading the model ...")
         # create empty model
         model = cls(*args, **kwargs)
         # need to train on 1 example to build weights of the correct size
@@ -148,7 +171,7 @@ def load(
         )
         # load trained weights
         model.load_weights(model_file_name)
-
+        logger.debug("Finished loading the model.")
         return model
 
     def _total_batch_loss(

From a1389f846085f5aa1e401d1d31d4f95e5c7bfafa Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 11:52:32 +0100
Subject: [PATCH 191/633] add types

---
 rasa/utils/tensorflow/tf_models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index f2135cfe2279..d411a46297d5 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -128,7 +128,9 @@ def train_on_batch(
         gradients = tape.gradient(total_loss, self.trainable_variables)
         self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
-    def build_for_predict(self, predict_data: RasaModelData, eager: bool = False):
+    def build_for_predict(
+        self, predict_data: RasaModelData, eager: bool = False
+    ) -> None:
         def predict_dataset_function(  # to reuse the same helper method
             _batch_size: Union[tf.Tensor, int]
         ) -> tf.data.Dataset:
@@ -138,7 +140,7 @@ def predict_dataset_function(  # to reuse the same helper method
             predict_dataset_function, self.batch_predict, eager, "prediction"
         )
 
-    def predict(self, predict_data):
+    def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
         if self._predict_function is None:
             logger.debug("There is no tensorflow prediction graph")
             self.build_for_predict(predict_data)

From 9089bfe4b5ecd76dcccffcc596e1c7105f83356b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 12:51:36 +0100
Subject: [PATCH 192/633] style updates

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 3 +--
 rasa/utils/tensorflow/tf_models.py                  | 8 ++++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 240284d20b56..a9eb553781fa 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -854,8 +854,7 @@ def __init__(
         self._create_metrics()
         self._update_metrics_to_log()
 
-        # predict
-        self.all_labels_embed = None
+        self.all_labels_embed = None  # needed for efficient prediction
 
     def _create_metrics(self):
         # self.metrics preserve order
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index d411a46297d5..4b23db66629b 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -142,7 +142,7 @@ def predict_dataset_function(  # to reuse the same helper method
 
     def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
         if self._predict_function is None:
-            logger.debug("There is no tensorflow prediction graph")
+            logger.debug("There is no tensorflow prediction graph.")
             self.build_for_predict(predict_data)
 
         predict_dataset = predict_data.as_tf_dataset(1)
@@ -207,14 +207,14 @@ def _get_tf_functions(
         dataset_function: Callable,
         call_model_function: Callable,
         eager: bool,
-        method: Text,
+        phase: Text,
     ) -> Tuple[Callable, Callable]:
         """Convert functions to tensorflow functions"""
 
         if eager:
             return dataset_function, call_model_function
 
-        logger.debug(f"Building tensorflow {method} graph...")
+        logger.debug(f"Building tensorflow {phase} graph...")
         # allows increasing batch size
         tf_dataset_function = tf.function(func=dataset_function)
 
@@ -225,7 +225,7 @@ def _get_tf_functions(
         )
         tf_method_function(next(iter(init_dataset)))
 
-        logger.debug(f"Finished building tensorflow {method} graph")
+        logger.debug(f"Finished building tensorflow {phase} graph.")
 
         return tf_dataset_function, tf_method_function
 

From b22969c0550aa7b856b4329142e7662234d673b5 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 12:56:33 +0100
Subject: [PATCH 193/633] refactoring

---
 rasa/core/policies/embedding_policy.py | 47 ++++++++++++--------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index b78033bf5752..03330554e3c5 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -195,7 +195,8 @@ def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray:
     def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
         """Prepare Y data for training: features for label_ids."""
 
-        if len(label_ids.shape) == 2:  # full dialogue featurizer is used
+        # full dialogue featurizer is used
+        if len(label_ids.shape) == 2:
             return np.stack(
                 [
                     np.stack(
@@ -207,33 +208,32 @@ def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
                     for seq_label_ids in label_ids
                 ]
             )
-        else:  # max history featurizer is used
-            return np.stack(
-                [self._encoded_all_label_ids[label_idx] for label_idx in label_ids]
-            )
+
+        # max history featurizer is used
+        return np.stack(
+            [self._encoded_all_label_ids[label_idx] for label_idx in label_ids]
+        )
 
     # noinspection PyPep8Naming
     def _create_model_data(
         self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None
     ) -> RasaModelData:
-        """Combine all tf session related data into dict."""
+        """Combine all model related data into RasaModelData."""
+
+        label_ids = np.array([])
+        Y = np.array([])
 
         if data_Y is not None:
-            # training time
             label_ids = self._label_ids_for_Y(data_Y)
             Y = self._label_features_for_Y(label_ids)
             # explicitly add last dimension to label_ids
             # to track correctly dynamic sequences
             label_ids = np.expand_dims(label_ids, -1)
-        else:
-            # prediction time
-            label_ids = np.array([])
-            Y = np.array([])
 
-        model_data = RasaModelData(label_key="action_ids")
+        model_data = RasaModelData(label_key="label_ids")
         model_data.add_features("dialogue_features", [data_X])
-        model_data.add_features("bot_features", [Y])
-        model_data.add_features("action_ids", [label_ids])
+        model_data.add_features("label_features", [Y])
+        model_data.add_features("label_ids", [label_ids])
 
         return model_data
 
@@ -262,10 +262,9 @@ def train(
 
         # check if number of negatives is less than number of label_ids
         logger.debug(
-            "Check if num_neg {} is smaller "
-            "than number of label_ids {}, "
-            "else set num_neg to the number of label_ids - 1"
-            "".format(self.config[NUM_NEG], domain.num_actions)
+            f"Check if num_neg {self.config[NUM_NEG]} is smaller "
+            f"than number of label_ids {domain.num_actions}, "
+            f"else set num_neg to the number of label_ids - 1."
         )
         self.config[NUM_NEG] = min(self.config[NUM_NEG], domain.num_actions - 1)
 
@@ -384,8 +383,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
 
         if not os.path.exists(path):
             raise Exception(
-                "Failed to load dialogue model. Path '{}' "
-                "doesn't exist".format(os.path.abspath(path))
+                f"Failed to load embedding policy model. Path "
+                f"'{os.path.abspath(path)}' doesn't exist"
             )
 
         file_name = "embedding_policy"
@@ -395,7 +394,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
 
         with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
             model_data_example = RasaModelData(
-                label_key="action_ids", data=pickle.load(f)
+                label_key="label_ids", data=pickle.load(f)
             )
 
         with open(
@@ -434,7 +433,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         # build the graph for prediction
         model.set_training_phase(False)
         model_data = RasaModelData(
-            label_key="action_ids",
+            label_key="label_ids",
             data={k: vs for k, vs in model_data_example.items() if "dialogue" in k},
         )
         model.build_for_predict()
@@ -603,9 +602,7 @@ def build_for_predict(self) -> None:
 
         self.all_labels_embed = tf.constant(all_labels_embed.numpy())
 
-    def predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], **kwargs
-    ) -> tf.Tensor:
+    def predict(self, batch_in: Tuple[tf.Tensor], **kwargs) -> tf.Tensor:
         dialogue_in = batch_in[0]
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)

From 021d9edabc6e6e9cbc9051786383b2924f2f1422 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 13:38:20 +0100
Subject: [PATCH 194/633] use native set_optimizer method

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 rasa/utils/tensorflow/tf_models.py                  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index c4b6831e2ca4..934fe676df2b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -871,7 +871,7 @@ def __init__(
         self._prepare_layers()
 
         # tf training
-        self._optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
+        self._set_optimizer(tf.keras.optimizers.Adam(config[LEARNING_RATE]))
         self._create_metrics()
         self._update_metrics_to_log()
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 4b23db66629b..f5c39a29db44 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -24,7 +24,6 @@ def __init__(self, *args, **kwargs):
         self.metrics_to_log = ["t_loss"]
 
         self._training = tf.ones((), tf.bool)
-        self._optimizer = None
 
         self._predict_function = None
 
@@ -126,7 +125,7 @@ def train_on_batch(
             total_loss = self._total_batch_loss(batch_in)
 
         gradients = tape.gradient(total_loss, self.trainable_variables)
-        self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
         self, predict_data: RasaModelData, eager: bool = False

From 35ac3d19be7c7ef4b084f1a584a12b6b960fbdbd Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 14:10:01 +0100
Subject: [PATCH 195/633] clean up after merge

---
 rasa/core/policies/embedding_policy.py        | 132 +++++-------------
 .../embedding_intent_classifier.py            |  38 +----
 .../selectors/embedding_response_selector.py  |  65 ++-------
 rasa/utils/tensorflow/constants.py            |  48 +++++++
 rasa/utils/tensorflow/tf_models.py            |  17 +--
 5 files changed, 109 insertions(+), 191 deletions(-)
 create mode 100644 rasa/utils/tensorflow/constants.py

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 03330554e3c5..3ad2fa2e5951 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -22,40 +22,12 @@
 from rasa.utils import train_utils
 from rasa.utils.tensorflow import tf_models, tf_layers
 from rasa.utils.tensorflow.tf_model_data import RasaModelData
+from rasa.utils.tensorflow.constants import *
 
 
 logger = logging.getLogger(__name__)
 
 
-# constants - configuration parameters
-HIDDEN_LAYERS_SIZES_PRE_DIAL = "hidden_layers_sizes_pre_dial"
-HIDDEN_LAYERS_SIZES_BOT = "hidden_layers_sizes_bot"
-TRANSFORMER_SIZE = "transformer_size"
-NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
-NUM_HEADS = "number_of_attention_heads"
-POS_ENCODING = "positional_encoding"
-MAX_SEQ_LENGTH = "maximum_sequence_length"
-BATCH_SIZES = "batch_sizes"
-BATCH_STRATEGY = "batch_strategy"
-EPOCHS = "epochs"
-RANDOM_SEED = "random_seed"
-EMBED_DIM = "embedding_dimension"
-NUM_NEG = "number_of_negative_examples"
-SIMILARITY_TYPE = "similarity_type"
-LOSS_TYPE = "loss_type"
-MU_POS = "maximum_positive_similarity"
-MU_NEG = "maximum_negative_similarity"
-USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
-SCALE_LOSS = "scale_loss"
-C2 = "l2_regularization"
-C_EMB = "c_emb"
-DROPRATE_DIAL = "droprate_dial"
-DROPRATE_BOT = "droprate_bot"
-EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
-EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
-RANKING_LENGTH = "ranking_length"
-
-
 class EmbeddingPolicy(Policy):
     """Transformer Embedding Dialogue Policy (TEDP)
 
@@ -148,7 +120,6 @@ def __init__(
         priority: int = DEFAULT_POLICY_PRIORITY,
         max_history: Optional[int] = None,
         model: Optional[tf_models.RasaModel] = None,
-        predict_func: Optional[Callable] = None,
         **kwargs: Any,
     ) -> None:
         """Declare instant variables with default values"""
@@ -161,7 +132,6 @@ def __init__(
         self._load_params(**kwargs)
 
         self.model = model
-        self.predict_func = predict_func
 
         # encode all label_ids with numbers
         self._encoded_all_label_ids = None
@@ -324,16 +294,16 @@ def predict_action_probabilities(
 
         Return the list of probabilities for the next actions.
         """
-        if self.model is None or self.predict_func is None:
+        if self.model is None:
             return [0.0] * domain.num_actions
 
         # create model data from message and convert it into a batch of 1
         data_X = self.featurizer.create_X([tracker], domain)
         model_data = self._create_model_data(data_X)
-        predict_dataset = model_data.as_tf_dataset(1)
-        batch_in = next(iter(predict_dataset))
 
-        confidence = self.predict_func(batch_in)
+        output = self.model.predict(model_data)
+
+        confidence = output["action_scores"]
         confidence = confidence[0, -1, :].numpy()
 
         if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
@@ -384,7 +354,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         if not os.path.exists(path):
             raise Exception(
                 f"Failed to load embedding policy model. Path "
-                f"'{os.path.abspath(path)}' doesn't exist"
+                f"'{os.path.abspath(path)}' doesn't exist."
             )
 
         file_name = "embedding_policy"
@@ -411,48 +381,26 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
             elif meta[LOSS_TYPE] == "margin":
                 meta[SIMILARITY_TYPE] = "cosine"
 
-        model = TED(
+        model = TED.load(
+            tf_model_file,
+            model_data_example,
             meta,
             isinstance(featurizer, MaxHistoryTrackerFeaturizer),
             encoded_all_label_ids,
         )
 
-        logger.debug("Loading the model ...")
-        model.fit(
-            model_data_example,
-            1,
-            1,
-            0,
-            0,
-            batch_strategy=meta[BATCH_STRATEGY],
-            silent=True,  # don't confuse users with training output
-            eager=True,  # no need to build tf graph, eager is faster here
-        )
-        model.load_weights(tf_model_file)
-
         # build the graph for prediction
-        model.set_training_phase(False)
-        model_data = RasaModelData(
+        predict_data_example = RasaModelData(
             label_key="label_ids",
             data={k: vs for k, vs in model_data_example.items() if "dialogue" in k},
         )
-        model.build_for_predict()
-        predict_dataset = model_data.as_tf_dataset(
-            1, batch_strategy="sequence", shuffle=False
-        )
-        predict_func = tf.function(
-            func=model.predict, input_signature=[predict_dataset.element_spec]
-        )
-        batch_in = next(iter(predict_dataset))
-        predict_func(batch_in)
-        logger.debug("Finished loading the model.")
+        model.build_for_predict(predict_data_example)
 
         return cls(
             featurizer=featurizer,
             component_config=meta,
             priority=meta["priority"],
             model=model,
-            predict_func=predict_func,
         )
 
 
@@ -471,9 +419,6 @@ def __init__(
         # optimizer
         self._optimizer = tf.keras.optimizers.Adam()
 
-        # tf tensors
-        self.training = tf.ones((), tf.bool)
-
         self.all_labels_embed = None
         self._encoded_all_label_ids = encoded_all_label_ids
 
@@ -497,14 +442,14 @@ def _prepare_layers(self) -> None:
             self.config[SCALE_LOSS],
         )
         self._tf_layers["ffnn.dialogue"] = tf_layers.ReluFfn(
-            self.config[HIDDEN_LAYERS_SIZES_PRE_DIAL],
-            self.config[DROPRATE_DIAL],
+            self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
+            self.config[DROPRATE_DIALOGUE],
             self.config[C2],
             layer_name_suffix="dialogue",
         )
         self._tf_layers["ffnn.label"] = tf_layers.ReluFfn(
-            self.config[HIDDEN_LAYERS_SIZES_BOT],
-            self.config[DROPRATE_BOT],
+            self.config[HIDDEN_LAYERS_SIZES_LABEL],
+            self.config[DROPRATE_LABEL],
             self.config[C2],
             layer_name_suffix="label",
         )
@@ -515,7 +460,7 @@ def _prepare_layers(self) -> None:
             self.config[TRANSFORMER_SIZE] * 4,
             self.config[MAX_SEQ_LENGTH],
             self.config[C2],
-            self.config[DROPRATE_DIAL],
+            self.config[DROPRATE_DIALOGUE],
             name="dialogue_encoder",
         )
         self._tf_layers["embed.dialogue"] = tf_layers.Embed(
@@ -531,11 +476,13 @@ def _prepare_layers(self) -> None:
             self.config[SIMILARITY_TYPE],
         )
 
-    def set_training_phase(self, training: bool) -> None:
-        if training:
-            self.training = tf.ones((), tf.bool)
-        else:
-            self.training = tf.zeros((), tf.bool)
+    def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_label = tf.constant(
+            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
+        )
+        all_labels_embed = self._embed_label(all_label)
+
+        return all_label, all_labels_embed
 
     def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
@@ -544,9 +491,9 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
         # if there is at least one `-1` it should be masked
         mask = tf.sign(tf.reduce_max(dialogue_in, -1) + 1)
 
-        dialogue = self._tf_layers["ffnn.dialogue"](dialogue_in, self.training)
+        dialogue = self._tf_layers["ffnn.dialogue"](dialogue_in, self._training)
         dialogue_transformed = self._tf_layers["transformer"](
-            dialogue, tf.expand_dims(mask, axis=-1), self.training
+            dialogue, tf.expand_dims(mask, axis=-1), self._training
         )
 
         if self.max_history_tracker_featurizer_used:
@@ -559,7 +506,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
         return dialogue_embed, mask
 
     def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
-        label = self._tf_layers["ffnn.label"](label_in, self.training)
+        label = self._tf_layers["ffnn.label"](label_in, self._training)
         return self._tf_layers["embed.label"](label)
 
     def batch_loss(
@@ -571,14 +518,10 @@ def batch_loss(
             # add time dimension if max history featurizer is used
             label_in = label_in[:, tf.newaxis, :]
 
-        all_label = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
-        )
+        all_label, self.all_labels_embed = self._create_all_labels_embed()
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
-
         label_embed = self._embed_label(label_in)
-        self.all_labels_embed = self._embed_label(all_label)
 
         loss, acc = self._tf_layers["loss.label"](
             dialogue_embed,
@@ -594,17 +537,14 @@ def batch_loss(
 
         return loss
 
-    def build_for_predict(self) -> None:
-        all_label = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
-        )
-        all_labels_embed = self._embed_label(all_label)
-
-        self.all_labels_embed = tf.constant(all_labels_embed.numpy())
-
-    def predict(self, batch_in: Tuple[tf.Tensor], **kwargs) -> tf.Tensor:
+    def batch_predict(
+        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
+    ) -> Dict[Text, tf.Tensor]:
         dialogue_in = batch_in[0]
 
+        if self.all_labels_embed is None:
+            _, self.all_labels_embed = self._create_all_labels_embed()
+
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
         sim_all = self._tf_layers["loss.label"].sim(
@@ -613,4 +553,8 @@ def predict(self, batch_in: Tuple[tf.Tensor], **kwargs) -> tf.Tensor:
             mask,
         )
 
-        return train_utils.confidence_from_sim(sim_all, self.config[SIMILARITY_TYPE])
+        scores = self._tf_layers["loss.label"].confidence_from_sim(
+            sim_all, self.config[SIMILARITY_TYPE]
+        )
+
+        return {"action_scores": scores}
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 0c5a45607a3b..92806f84dd3b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -18,6 +18,7 @@
 from rasa.utils import train_utils
 from rasa.utils.tensorflow import tf_layers, tf_models
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow.constants import *
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -31,46 +32,9 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
 
-
 logger = logging.getLogger(__name__)
 
 
-# constants - configuration parameters
-HIDDEN_LAYERS_SIZES_TEXT = "hidden_layers_sizes_text"
-HIDDEN_LAYERS_SIZES_LABEL = "hidden_layers_sizes_label"
-SHARE_HIDDEN_LAYERS = "share_hidden_layers"
-TRANSFORMER_SIZE = "transformer_size"
-NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
-NUM_HEADS = "number_of_attention_heads"
-POS_ENCODING = "positional_encoding"
-MAX_SEQ_LENGTH = "maximum_sequence_length"
-BATCH_SIZES = "batch_sizes"
-BATCH_STRATEGY = "batch_strategy"
-EPOCHS = "epochs"
-RANDOM_SEED = "random_seed"
-LEARNING_RATE = "learning_rate"
-DENSE_DIM = "dense_dimensions"
-EMBED_DIM = "embedding_dimension"
-NUM_NEG = "number_of_negative_examples"
-SIMILARITY_TYPE = "similarity_type"
-LOSS_TYPE = "loss_type"
-MU_POS = "maximum_positive_similarity"
-MU_NEG = "maximum_negative_similarity"
-USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
-SCALE_LOSS = "scale_loss"
-C2 = "l2_regularization"
-C_EMB = "c_emb"
-DROPRATE = "droprate"
-UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
-EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
-EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
-INTENT_CLASSIFICATION = "perform_intent_classification"
-ENTITY_RECOGNITION = "perform_entity_recognition"
-MASKED_LM = "use_masked_language_model"
-SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
-RANKING_LENGTH = "ranking_length"
-
-
 class EmbeddingIntentClassifier(EntityExtractor):
     """label classifier using supervised embeddings.
 
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index f4d148992a2c..25751404c20f 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -1,44 +1,10 @@
 import logging
-import typing
 from typing import Any, Dict, Text
 
+from rasa.nlu.training_data import TrainingData, Message
+from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.components import any_of
-from rasa.nlu.classifiers.embedding_intent_classifier import (
-    EmbeddingIntentClassifier,
-    USE_MAX_SIM_NEG,
-    HIDDEN_LAYERS_SIZES_TEXT,
-    HIDDEN_LAYERS_SIZES_LABEL,
-    SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
-    NUM_TRANSFORMER_LAYERS,
-    POS_ENCODING,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
-    BATCH_SIZES,
-    BATCH_STRATEGY,
-    EPOCHS,
-    RANDOM_SEED,
-    LEARNING_RATE,
-    DENSE_DIM,
-    EMBED_DIM,
-    NUM_NEG,
-    SIMILARITY_TYPE,
-    LOSS_TYPE,
-    MU_POS,
-    MU_NEG,
-    SCALE_LOSS,
-    C2,
-    C_EMB,
-    DROPRATE,
-    UNIDIRECTIONAL_ENCODER,
-    EVAL_NUM_EPOCHS,
-    EVAL_NUM_EXAMPLES,
-    INTENT_CLASSIFICATION,
-    ENTITY_RECOGNITION,
-    MASKED_LM,
-    SPARSE_INPUT_DROPOUT,
-    RANKING_LENGTH,
-)
+from rasa.utils.tensorflow.constants import *
 from rasa.nlu.constants import (
     RESPONSE_ATTRIBUTE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
@@ -47,11 +13,10 @@
     TEXT_ATTRIBUTE,
     SPARSE_FEATURE_NAMES,
 )
+from rasa.utils.tensorflow.tf_model_data import RasaModelData
 
-logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from rasa.nlu.training_data import Message
+logger = logging.getLogger(__name__)
 
 
 class ResponseSelector(EmbeddingIntentClassifier):
@@ -170,7 +135,7 @@ class ResponseSelector(EmbeddingIntentClassifier):
 
     # end default properties (DOC MARKER - don't remove)
 
-    def _load_selector_params(self, config: Dict[Text, Any]):
+    def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
             # retrieval intent was left to its default value
@@ -180,13 +145,12 @@ def _load_selector_params(self, config: Dict[Text, Any]):
             )
 
     def _load_params(self) -> None:
-        super()._load_params()
         self._load_selector_params(self.component_config)
 
     @staticmethod
     def _set_message_property(
-        message: "Message", prediction_dict: Dict[Text, Any], selector_key: Text
-    ):
+        message: Message, prediction_dict: Dict[Text, Any], selector_key: Text
+    ) -> None:
 
         message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {})
         message_selector_properties[selector_key] = prediction_dict
@@ -196,7 +160,7 @@ def _set_message_property(
             add_to_output=True,
         )
 
-    def preprocess_train_data(self, training_data):
+    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         """Performs sanity checks on training data, extracts encodings for labels
         and prepares data for training"""
         if self.retrieval_intent:
@@ -211,20 +175,21 @@ def preprocess_train_data(self, training_data):
             training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE
         )
 
-        session_data = self._create_model_data(
+        model_data = self._create_model_data(
             training_data.intent_examples,
             label_id_dict,
             label_attribute=RESPONSE_ATTRIBUTE,
         )
 
-        self.check_input_dimension_consistency(session_data)
+        self.check_input_dimension_consistency(model_data)
 
-        return session_data
+        return model_data
 
-    def process(self, message: "Message", **kwargs: Any) -> None:
+    def process(self, message: Message, **kwargs: Any) -> None:
         """Return the most likely response and its similarity to the input."""
 
-        label, label_ranking = self.predict_label(message)
+        out = self._predict(message)
+        label, label_ranking = self._predict_label(out)
 
         selector_key = (
             self.retrieval_intent
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
new file mode 100644
index 000000000000..a47b9885aaf9
--- /dev/null
+++ b/rasa/utils/tensorflow/constants.py
@@ -0,0 +1,48 @@
+# constants - configuration parameters
+
+HIDDEN_LAYERS_SIZES_TEXT = "hidden_layers_sizes_text"
+HIDDEN_LAYERS_SIZES_LABEL = "hidden_layers_sizes_label"
+HIDDEN_LAYERS_SIZES_DIALOGUE = "hidden_layers_sizes_dialogue"
+SHARE_HIDDEN_LAYERS = "share_hidden_layers"
+
+TRANSFORMER_SIZE = "transformer_size"
+NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
+NUM_HEADS = "number_of_attention_heads"
+UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
+
+POS_ENCODING = "positional_encoding"
+MAX_SEQ_LENGTH = "maximum_sequence_length"
+
+BATCH_SIZES = "batch_sizes"
+BATCH_STRATEGY = "batch_strategy"
+EPOCHS = "epochs"
+RANDOM_SEED = "random_seed"
+LEARNING_RATE = "learning_rate"
+
+DENSE_DIM = "dense_dimensions"
+EMBED_DIM = "embedding_dimension"
+
+SIMILARITY_TYPE = "similarity_type"
+LOSS_TYPE = "loss_type"
+NUM_NEG = "number_of_negative_examples"
+MU_POS = "maximum_positive_similarity"
+MU_NEG = "maximum_negative_similarity"
+USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
+
+SCALE_LOSS = "scale_loss"
+C2 = "l2_regularization"
+C_EMB = "c_emb"
+DROPRATE = "droprate"
+DROPRATE_DIALOGUE = "droprate_dialogue"
+DROPRATE_LABEL = "droprate_label"
+
+EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
+EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
+
+INTENT_CLASSIFICATION = "perform_intent_classification"
+ENTITY_RECOGNITION = "perform_entity_recognition"
+MASKED_LM = "use_masked_language_model"
+
+SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
+
+RANKING_LENGTH = "ranking_length"
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 4b23db66629b..fac799363b8f 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -145,7 +145,7 @@ def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
             logger.debug("There is no tensorflow prediction graph.")
             self.build_for_predict(predict_data)
 
-        predict_dataset = predict_data.as_tf_dataset(1)
+        predict_dataset = predict_data.as_tf_dataset(batch_size=1)
         batch_in = next(iter(predict_dataset))
         self.set_training_phase(False)
         return self._predict_function(batch_in)
@@ -163,10 +163,10 @@ def load(
         # need to train on 1 example to build weights of the correct size
         model.fit(
             model_data_example,
-            1,
-            1,
-            0,
-            0,
+            epochs=1,
+            batch_size=1,
+            evaluate_every_num_epochs=0,
+            evaluate_on_num_examples=0,
             batch_strategy="sequence",
             silent=True,  # don't confuse users with training output
             eager=True,  # no need to build tf graph, eager is faster here
@@ -230,7 +230,7 @@ def _get_tf_functions(
         return tf_dataset_function, tf_method_function
 
     def _get_tf_train_functions(
-        self, eager: bool, model_data: RasaModelData, batch_strategy: Text,
+        self, eager: bool, model_data: RasaModelData, batch_strategy: Text
     ) -> Tuple[Callable, Callable]:
         """Create train tensorflow functions"""
 
@@ -261,10 +261,7 @@ def evaluation_dataset_function(
                 )
 
             return self._get_tf_functions(
-                evaluation_dataset_function,
-                self._total_batch_loss,
-                eager,
-                "evaluation",
+                evaluation_dataset_function, self._total_batch_loss, eager, "evaluation"
             )
 
         return None, None

From 5b530410573bb620fa9aa19ca57918e7ddd534cf Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 14:14:29 +0100
Subject: [PATCH 196/633] update method signatures

---
 rasa/core/policies/embedding_policy.py        |  8 ++------
 .../embedding_intent_classifier.py            |  8 ++------
 rasa/utils/tensorflow/tf_models.py            | 19 +++++--------------
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 3ad2fa2e5951..61a2db5f0fcb 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -509,9 +509,7 @@ def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
         label = self._tf_layers["ffnn.label"](label_in, self._training)
         return self._tf_layers["embed.label"](label)
 
-    def batch_loss(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> tf.Tensor:
+    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
         dialogue_in, label_in, _ = batch_in
 
         if self.max_history_tracker_featurizer_used:
@@ -537,9 +535,7 @@ def batch_loss(
 
         return loss
 
-    def batch_predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> Dict[Text, tf.Tensor]:
+    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
         dialogue_in = batch_in[0]
 
         if self.all_labels_embed is None:
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 92806f84dd3b..e824503bac9e 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1108,9 +1108,7 @@ def _entity_loss(
 
         return loss, f1
 
-    def batch_loss(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> tf.Tensor:
+    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1152,9 +1150,7 @@ def batch_loss(
 
         return tf.math.add_n(losses)
 
-    def batch_predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> Dict[Text, tf.Tensor]:
+    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
         tf_batch_data = self.batch_to_model_data_format(
             batch_in, self.predict_data_signature
         )
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index fac799363b8f..27d76075f79b 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -28,14 +28,10 @@ def __init__(self, *args, **kwargs):
 
         self._predict_function = None
 
-    def batch_loss(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> tf.Tensor:
+    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
         raise NotImplementedError
 
-    def batch_predict(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> Dict[Text, tf.Tensor]:
+    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
     def set_training_phase(self, training: bool) -> None:
@@ -117,9 +113,7 @@ def fit(
         if not disable:
             logger.info("Finished training.")
 
-    def train_on_batch(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> None:
+    def train_on_batch(self, batch_in: List[tf.Tensor]) -> None:
         """Train on batch"""
 
         with tf.GradientTape() as tape:
@@ -176,9 +170,7 @@ def load(
         logger.debug("Finished loading the model.")
         return model
 
-    def _total_batch_loss(
-        self, batch_in: Union[Tuple[np.ndarray], Tuple[tf.Tensor]]
-    ) -> tf.Tensor:
+    def _total_batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
         """Calculate total loss"""
 
         prediction_loss = self.batch_loss(batch_in)
@@ -289,8 +281,7 @@ def _should_evaluate(
 
     @staticmethod
     def batch_to_model_data_format(
-        batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]],
-        data_signature: Dict[Text, List[FeatureSignature]],
+        batch: List[tf.Tensor], data_signature: Dict[Text, List[FeatureSignature]]
     ) -> Dict[Text, List[tf.Tensor]]:
         """Convert input batch tensors into batch data format.
     

From 10e40b086f4da07a095551f65ceee3f4ebc1f206 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 14:30:12 +0100
Subject: [PATCH 197/633] fix response selector

---
 rasa/core/policies/embedding_policy.py            | 8 ++++----
 rasa/nlu/selectors/embedding_response_selector.py | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 61a2db5f0fcb..9d2cc1307c9d 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -41,10 +41,10 @@ class EmbeddingPolicy(Policy):
         # nn architecture
         # a list of hidden layers sizes before user embed layer
         # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_PRE_DIAL: [],
+        HIDDEN_LAYERS_SIZES_DIALOGUE: [],
         # a list of hidden layers sizes before bot embed layer
         # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_BOT: [],
+        HIDDEN_LAYERS_SIZES_LABEL: [],
         # number of units in transformer
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
@@ -94,9 +94,9 @@ class EmbeddingPolicy(Policy):
         # between embeddings of different labels
         C_EMB: 0.8,
         # dropout rate for dial nn
-        DROPRATE_DIAL: 0.1,
+        DROPRATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
-        DROPRATE_BOT: 0.0,
+        DROPRATE_LABEL: 0.0,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 25751404c20f..6fd00a6c0691 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -144,7 +144,8 @@ def _load_selector_params(self, config: Dict[Text, Any]) -> None:
                 "on training examples combining all retrieval intents."
             )
 
-    def _load_params(self) -> None:
+    def _check_config_parameters(self) -> None:
+        super()._check_config_parameters()
         self._load_selector_params(self.component_config)
 
     @staticmethod

From 4521096d8bfaa36279c03939057bec7ca2beb8c4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 14:47:39 +0100
Subject: [PATCH 198/633] Use set optimizer

---
 rasa/core/policies/embedding_policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 9d2cc1307c9d..b7184fc14b4d 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -417,7 +417,7 @@ def __init__(
         self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
 
         # optimizer
-        self._optimizer = tf.keras.optimizers.Adam()
+        self._set_optimizer(tf.keras.optimizers.Adam())
 
         self.all_labels_embed = None
         self._encoded_all_label_ids = encoded_all_label_ids

From 87cc1c528385d56823ed29e09329bd79d9dd729b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 15:18:46 +0100
Subject: [PATCH 199/633] use model.save

---
 rasa/core/policies/embedding_policy.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index b7184fc14b4d..eeda92979cb0 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -297,14 +297,14 @@ def predict_action_probabilities(
         if self.model is None:
             return [0.0] * domain.num_actions
 
-        # create model data from message and convert it into a batch of 1
+        # create model data from tracker
         data_X = self.featurizer.create_X([tracker], domain)
         model_data = self._create_model_data(data_X)
 
         output = self.model.predict(model_data)
 
-        confidence = output["action_scores"]
-        confidence = confidence[0, -1, :].numpy()
+        confidence = output["action_scores"].numpy()
+        confidence = confidence[0, -1, :]
 
         if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
             confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH])
@@ -324,7 +324,7 @@ def persist(self, path: Text):
 
         self.featurizer.persist(path)
 
-        self.model.save_weights(tf_model_file, save_format="tf")
+        self.model.save(tf_model_file)
 
         with open(os.path.join(path, file_name + ".tf_config.pkl"), "wb") as f:
             pickle.dump(self._tf_config, f)
@@ -342,8 +342,6 @@ def persist(self, path: Text):
         ) as f:
             pickle.dump(self._encoded_all_label_ids, f)
 
-        return {"file": file_name}
-
     @classmethod
     def load(cls, path: Text) -> "EmbeddingPolicy":
         """Loads a policy from the storage.
@@ -384,9 +382,11 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         model = TED.load(
             tf_model_file,
             model_data_example,
-            meta,
-            isinstance(featurizer, MaxHistoryTrackerFeaturizer),
-            encoded_all_label_ids,
+            config=meta,
+            max_history_tracker_featurizer_used=isinstance(
+                featurizer, MaxHistoryTrackerFeaturizer
+            ),
+            encoded_all_label_ids=encoded_all_label_ids,
         )
 
         # build the graph for prediction

From 912f154e03c351bf4da50e8a486ccb5b2d446f9b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 15:39:28 +0100
Subject: [PATCH 200/633] add types

---
 rasa/utils/tensorflow/tf_layers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index d60cc8fcc0e5..6f60091efafc 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -290,14 +290,14 @@ def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
         return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
 
     @staticmethod
-    def _get_angles(pos, i, d_model):
+    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
         angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
         return pos * angle_rates
 
     @classmethod
-    def _positional_encoding(cls, position, d_model) -> tf.Tensor:
+    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
         angle_rads = cls._get_angles(
-            np.arange(position)[:, np.newaxis],
+            np.arange(max_position)[:, np.newaxis],
             np.arange(d_model)[np.newaxis, :],
             d_model,
         )

From 0726d6c194d4a30504b38ee830678159f9b17958 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Fri, 24 Jan 2020 15:40:50 +0100
Subject: [PATCH 201/633] Update rasa/core/policies/embedding_policy.py

---
 rasa/core/policies/embedding_policy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index eeda92979cb0..3cd7bdad2075 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -461,6 +461,7 @@ def _prepare_layers(self) -> None:
             self.config[MAX_SEQ_LENGTH],
             self.config[C2],
             self.config[DROPRATE_DIALOGUE],
+            unidirectional=True,
             name="dialogue_encoder",
         )
         self._tf_layers["embed.dialogue"] = tf_layers.Embed(

From 9c7e51276f02218e13a6427a837c79d0f298919f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 15:42:00 +0100
Subject: [PATCH 202/633] review comments

---
 rasa/core/policies/embedding_policy.py | 29 +++++++++++++++++---------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 3cd7bdad2075..de3682529359 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -21,7 +21,7 @@
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils import train_utils
 from rasa.utils.tensorflow import tf_models, tf_layers
-from rasa.utils.tensorflow.tf_model_data import RasaModelData
+from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import *
 
 
@@ -245,6 +245,7 @@ def train(
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
         self.model = TED(
+            model_data.get_signature(),
             self.config,
             isinstance(self.featurizer, MaxHistoryTrackerFeaturizer),
             self._encoded_all_label_ids,
@@ -382,6 +383,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         model = TED.load(
             tf_model_file,
             model_data_example,
+            data_signature=model_data_example.get_signature(),
             config=meta,
             max_history_tracker_featurizer_used=isinstance(
                 featurizer, MaxHistoryTrackerFeaturizer
@@ -407,6 +409,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
 class TED(tf_models.RasaModel):
     def __init__(
         self,
+        data_signature: Dict[Text, List[FeatureSignature]],
         config: Dict[Text, Any],
         max_history_tracker_featurizer_used: bool,
         encoded_all_label_ids: np.ndarray,
@@ -416,6 +419,12 @@ def __init__(
         self.config = config
         self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
 
+        # data
+        self.data_signature = data_signature
+        self.predict_data_signature = {
+            k: vs for k, vs in data_signature.items() if "dialogue" in k
+        }
+
         # optimizer
         self._set_optimizer(tf.keras.optimizers.Adam())
 
@@ -511,24 +520,22 @@ def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
         return self._tf_layers["embed.label"](label)
 
     def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
-        dialogue_in, label_in, _ = batch_in
+        batch = self.batch_to_model_data_format(batch_in, self.data_signature)
+
+        dialogue_in = batch["dialogue_features"][0]
+        label_in = batch["label_features"][0]
 
         if self.max_history_tracker_featurizer_used:
             # add time dimension if max history featurizer is used
             label_in = label_in[:, tf.newaxis, :]
 
-        all_label, self.all_labels_embed = self._create_all_labels_embed()
+        all_label, all_labels_embed = self._create_all_labels_embed()
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
         label_embed = self._embed_label(label_in)
 
         loss, acc = self._tf_layers["loss.label"](
-            dialogue_embed,
-            label_embed,
-            label_in,
-            self.all_labels_embed,
-            all_label,
-            mask,
+            dialogue_embed, label_embed, label_in, all_labels_embed, all_label, mask
         )
 
         self.metric_loss.update_state(loss)
@@ -537,7 +544,9 @@ def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
         return loss
 
     def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
-        dialogue_in = batch_in[0]
+        batch = self.batch_to_model_data_format(batch_in, self.predict_data_signature)
+
+        dialogue_in = batch["dialogue_features"][0]
 
         if self.all_labels_embed is None:
             _, self.all_labels_embed = self._create_all_labels_embed()

From 1a5ee6b2f332b78ba025c142510cbb976980caea Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 15:57:27 +0100
Subject: [PATCH 203/633] review comments

---
 rasa/core/policies/embedding_policy.py              | 8 +++-----
 rasa/core/policies/keras_policy.py                  | 1 -
 rasa/nlu/classifiers/embedding_intent_classifier.py | 6 +++---
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index de3682529359..8d252af9ed46 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -486,10 +486,8 @@ def _prepare_layers(self) -> None:
             self.config[SIMILARITY_TYPE],
         )
 
-    def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label = tf.constant(
-            self._encoded_all_label_ids, dtype=tf.float32, name="all_label"
-        )
+    def _create_all_labels_embed(self) -> Tuple[np.ndarray, tf.Tensor]:
+        all_label = self._encoded_all_label_ids.astype(np.float32)
         all_labels_embed = self._embed_label(all_label)
 
         return all_label, all_labels_embed
@@ -515,7 +513,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
 
         return dialogue_embed, mask
 
-    def _embed_label(self, label_in: tf.Tensor) -> tf.Tensor:
+    def _embed_label(self, label_in: Union[tf.Tensor, np.ndarray]) -> tf.Tensor:
         label = self._tf_layers["ffnn.label"](label_in, self._training)
         return self._tf_layers["embed.label"](label)
 
diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index f53ce6b81681..2f6cd61bb49c 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -18,7 +18,6 @@
 from rasa.core.policies.policy import Policy
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils.common import obtain_verbosity
-from rasa.utils.tensorflow import tf_models
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 
 # there are a number of issues with imports from tensorflow. hence the deactivation
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 02355438c77d..8b14afe58f31 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1043,7 +1043,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         )
         all_labels_embed = self._tf_layers["embed.label"](all_labels)
 
-        return all_labels_embed, all_labels
+        return all_labels, all_labels_embed
 
     @staticmethod
     def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
@@ -1073,7 +1073,7 @@ def _mask_loss(
         )
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
-        all_labels_embed, all_labels = self._create_all_labels()
+        all_labels, all_labels_embed = self._create_all_labels()
 
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
@@ -1165,7 +1165,7 @@ def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
         out = {}
         if self.config[INTENT_CLASSIFICATION]:
             if self.all_labels_embed is None:
-                self.all_labels_embed, _ = self._create_all_labels()
+                _, self.all_labels_embed = self._create_all_labels()
 
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)

From 79dc7472fba43c45f3d3e612bf0fb0a024a8c07e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 16:38:17 +0100
Subject: [PATCH 204/633] import RasaModel directly

---
 rasa/core/policies/embedding_policy.py              | 9 +++++----
 rasa/nlu/classifiers/embedding_intent_classifier.py | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 8d252af9ed46..e90a6e48f56c 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -6,7 +6,7 @@
 import numpy as np
 import tensorflow as tf
 
-from typing import Any, List, Optional, Text, Dict, Tuple, Union, Callable
+from typing import Any, List, Optional, Text, Dict, Tuple, Union
 
 import rasa.utils.io
 from rasa.core.domain import Domain
@@ -20,7 +20,8 @@
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils import train_utils
-from rasa.utils.tensorflow import tf_models, tf_layers
+from rasa.utils.tensorflow import tf_layers
+from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import *
 
@@ -119,7 +120,7 @@ def __init__(
         featurizer: Optional[TrackerFeaturizer] = None,
         priority: int = DEFAULT_POLICY_PRIORITY,
         max_history: Optional[int] = None,
-        model: Optional[tf_models.RasaModel] = None,
+        model: Optional[RasaModel] = None,
         **kwargs: Any,
     ) -> None:
         """Declare instant variables with default values"""
@@ -406,7 +407,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         )
 
 
-class TED(tf_models.RasaModel):
+class TED(RasaModel):
     def __init__(
         self,
         data_signature: Dict[Text, List[FeatureSignature]],
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8b14afe58f31..40a517cdf88a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -16,7 +16,8 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import any_of
 from rasa.utils import train_utils
-from rasa.utils.tensorflow import tf_layers, tf_models
+from rasa.utils.tensorflow import tf_layers
+from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import *
 from rasa.nlu.constants import (
@@ -174,7 +175,7 @@ def __init__(
         component_config: Optional[Dict[Text, Any]] = None,
         inverted_label_dict: Optional[Dict[int, Text]] = None,
         inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        model: Optional[tf_models.RasaModel] = None,
+        model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
         """Declare instance variables with default values"""
@@ -793,7 +794,7 @@ def load(
         )
 
 
-class DIET(tf_models.RasaModel):
+class DIET(RasaModel):
     def __init__(
         self,
         data_signature: Dict[Text, List[FeatureSignature]],

From 18b32903382b3717d7662e714684fa71b8ba1d4c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 16:41:48 +0100
Subject: [PATCH 205/633] naming and types

---
 rasa/core/policies/embedding_policy.py        | 16 ++++++++++------
 .../embedding_intent_classifier.py            |  8 ++++++--
 rasa/utils/tensorflow/tf_models.py            | 19 ++++++++++++++-----
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index e90a6e48f56c..56857a3e3c93 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -488,10 +488,10 @@ def _prepare_layers(self) -> None:
         )
 
     def _create_all_labels_embed(self) -> Tuple[np.ndarray, tf.Tensor]:
-        all_label = self._encoded_all_label_ids.astype(np.float32)
+        all_labels = self._encoded_all_label_ids.astype(np.float32)
         all_labels_embed = self._embed_label(all_label)
 
-        return all_label, all_labels_embed
+        return all_labels, all_labels_embed
 
     def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
@@ -518,7 +518,9 @@ def _embed_label(self, label_in: Union[tf.Tensor, np.ndarray]) -> tf.Tensor:
         label = self._tf_layers["ffnn.label"](label_in, self._training)
         return self._tf_layers["embed.label"](label)
 
-    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
+    def batch_loss(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> tf.Tensor:
         batch = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         dialogue_in = batch["dialogue_features"][0]
@@ -528,13 +530,13 @@ def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
             # add time dimension if max history featurizer is used
             label_in = label_in[:, tf.newaxis, :]
 
-        all_label, all_labels_embed = self._create_all_labels_embed()
+        all_labels, all_labels_embed = self._create_all_labels_embed()
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
         label_embed = self._embed_label(label_in)
 
         loss, acc = self._tf_layers["loss.label"](
-            dialogue_embed, label_embed, label_in, all_labels_embed, all_label, mask
+            dialogue_embed, label_embed, label_in, all_labels_embed, all_labels, mask
         )
 
         self.metric_loss.update_state(loss)
@@ -542,7 +544,9 @@ def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
 
         return loss
 
-    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
+    def batch_predict(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
         batch = self.batch_to_model_data_format(batch_in, self.predict_data_signature)
 
         dialogue_in = batch["dialogue_features"][0]
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 40a517cdf88a..cf12b2f0e4ba 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1109,7 +1109,9 @@ def _entity_loss(
 
         return loss, f1
 
-    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
+    def batch_loss(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
         mask_text = tf_batch_data["text_mask"][0]
@@ -1151,7 +1153,9 @@ def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
 
         return tf.math.add_n(losses)
 
-    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
+    def batch_predict(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
         tf_batch_data = self.batch_to_model_data_format(
             batch_in, self.predict_data_signature
         )
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 14d8490d316b..2edde57eab3a 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -27,10 +27,14 @@ def __init__(self, *args, **kwargs):
 
         self._predict_function = None
 
-    def batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
+    def batch_loss(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> tf.Tensor:
         raise NotImplementedError
 
-    def batch_predict(self, batch_in: List[tf.Tensor]) -> Dict[Text, tf.Tensor]:
+    def batch_predict(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
     def set_training_phase(self, training: bool) -> None:
@@ -112,7 +116,9 @@ def fit(
         if not disable:
             logger.info("Finished training.")
 
-    def train_on_batch(self, batch_in: List[tf.Tensor]) -> None:
+    def train_on_batch(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> None:
         """Train on batch"""
 
         with tf.GradientTape() as tape:
@@ -169,7 +175,9 @@ def load(
         logger.debug("Finished loading the model.")
         return model
 
-    def _total_batch_loss(self, batch_in: List[tf.Tensor]) -> tf.Tensor:
+    def _total_batch_loss(
+        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+    ) -> tf.Tensor:
         """Calculate total loss"""
 
         prediction_loss = self.batch_loss(batch_in)
@@ -280,7 +288,8 @@ def _should_evaluate(
 
     @staticmethod
     def batch_to_model_data_format(
-        batch: List[tf.Tensor], data_signature: Dict[Text, List[FeatureSignature]]
+        batch: Union[List[tf.Tensor], List[np.ndarray]],
+        data_signature: Dict[Text, List[FeatureSignature]],
     ) -> Dict[Text, List[tf.Tensor]]:
         """Convert input batch tensors into batch data format.
     

From ba214a854f29b06528148cb5f249ab8461071cb1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 24 Jan 2020 17:13:29 +0100
Subject: [PATCH 206/633] use label_data

---
 rasa/core/policies/embedding_policy.py | 61 ++++++++++++++------------
 rasa/utils/tensorflow/tf_models.py     | 10 ++---
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 56857a3e3c93..482823ec87ad 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -134,9 +134,7 @@ def __init__(
 
         self.model = model
 
-        # encode all label_ids with numbers
-        self._encoded_all_label_ids = None
-
+        self._label_data = None
         self.data_example = None
 
         self._tf_config = train_utils.load_tf_config(self.config)
@@ -172,7 +170,7 @@ def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
                 [
                     np.stack(
                         [
-                            self._encoded_all_label_ids[label_idx]
+                            self._label_data.get("label_features")[0][label_idx]
                             for label_idx in seq_label_ids
                         ]
                     )
@@ -182,7 +180,10 @@ def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
 
         # max history featurizer is used
         return np.stack(
-            [self._encoded_all_label_ids[label_idx] for label_idx in label_ids]
+            [
+                self._label_data.get("label_features")[0][label_idx]
+                for label_idx in label_ids
+            ]
         )
 
     # noinspection PyPep8Naming
@@ -208,6 +209,16 @@ def _create_model_data(
 
         return model_data
 
+    def _create_label_data(self, domain: Domain) -> RasaModelData:
+        # encode all label_ids with policies' featurizer
+        state_featurizer = self.featurizer.state_featurizer
+        all_labels = state_featurizer.create_encoded_all_actions(domain)
+        all_labels = all_labels.astype(np.float32)
+
+        label_data = RasaModelData(label_key="label_features")
+        label_data.add_features("label_features", [all_labels])
+        return label_data
+
     # training methods
     def train(
         self,
@@ -225,11 +236,7 @@ def train(
         # dealing with training data
         training_data = self.featurize_for_training(training_trackers, domain, **kwargs)
 
-        # encode all label_ids with policies' featurizer
-        state_featurizer = self.featurizer.state_featurizer
-        self._encoded_all_label_ids = state_featurizer.create_encoded_all_actions(
-            domain
-        )
+        self._label_data = self._create_label_data(domain)
 
         # check if number of negatives is less than number of label_ids
         logger.debug(
@@ -249,7 +256,7 @@ def train(
             model_data.get_signature(),
             self.config,
             isinstance(self.featurizer, MaxHistoryTrackerFeaturizer),
-            self._encoded_all_label_ids,
+            self._label_data,
         )
 
         self.model.fit(
@@ -339,10 +346,8 @@ def persist(self, path: Text):
         with open(os.path.join(path, file_name + ".data_example.pkl"), "wb") as f:
             pickle.dump(self.data_example, f)
 
-        with open(
-            os.path.join(path, file_name + ".encoded_all_label_ids.pkl"), "wb"
-        ) as f:
-            pickle.dump(self._encoded_all_label_ids, f)
+        with open(os.path.join(path, file_name + ".label_data.pkl"), "wb") as f:
+            pickle.dump(self._label_data, f)
 
     @classmethod
     def load(cls, path: Text) -> "EmbeddingPolicy":
@@ -367,10 +372,8 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
                 label_key="label_ids", data=pickle.load(f)
             )
 
-        with open(
-            os.path.join(path, file_name + ".encoded_all_label_ids.pkl"), "rb"
-        ) as f:
-            encoded_all_label_ids = pickle.load(f)
+        with open(os.path.join(path, file_name + ".label_data.pkl"), "rb") as f:
+            label_data = pickle.load(f)
 
         with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
             meta = pickle.load(f)
@@ -389,7 +392,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
             max_history_tracker_featurizer_used=isinstance(
                 featurizer, MaxHistoryTrackerFeaturizer
             ),
-            encoded_all_label_ids=encoded_all_label_ids,
+            label_data=label_data,
         )
 
         # build the graph for prediction
@@ -413,7 +416,7 @@ def __init__(
         data_signature: Dict[Text, List[FeatureSignature]],
         config: Dict[Text, Any],
         max_history_tracker_featurizer_used: bool,
-        encoded_all_label_ids: np.ndarray,
+        label_data: RasaModelData,
     ):
         super().__init__()
 
@@ -430,7 +433,11 @@ def __init__(
         self._set_optimizer(tf.keras.optimizers.Adam())
 
         self.all_labels_embed = None
-        self._encoded_all_label_ids = encoded_all_label_ids
+
+        label_batch = label_data.prepare_batch()
+        self.tf_label_data = self.batch_to_model_data_format(
+            label_batch, label_data.get_signature()
+        )
 
         # metrics
         self.metric_loss = tf.keras.metrics.Mean(name="loss")
@@ -487,9 +494,9 @@ def _prepare_layers(self) -> None:
             self.config[SIMILARITY_TYPE],
         )
 
-    def _create_all_labels_embed(self) -> Tuple[np.ndarray, tf.Tensor]:
-        all_labels = self._encoded_all_label_ids.astype(np.float32)
-        all_labels_embed = self._embed_label(all_label)
+    def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_labels = self.tf_label_data["label_features"][0]
+        all_labels_embed = self._embed_label(all_labels)
 
         return all_labels, all_labels_embed
 
@@ -519,7 +526,7 @@ def _embed_label(self, label_in: Union[tf.Tensor, np.ndarray]) -> tf.Tensor:
         return self._tf_layers["embed.label"](label)
 
     def batch_loss(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         batch = self.batch_to_model_data_format(batch_in, self.data_signature)
 
@@ -545,7 +552,7 @@ def batch_loss(
         return loss
 
     def batch_predict(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> Dict[Text, tf.Tensor]:
         batch = self.batch_to_model_data_format(batch_in, self.predict_data_signature)
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 2edde57eab3a..4c703d68b3b1 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -28,12 +28,12 @@ def __init__(self, *args, **kwargs):
         self._predict_function = None
 
     def batch_loss(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         raise NotImplementedError
 
     def batch_predict(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
@@ -117,7 +117,7 @@ def fit(
             logger.info("Finished training.")
 
     def train_on_batch(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> None:
         """Train on batch"""
 
@@ -176,7 +176,7 @@ def load(
         return model
 
     def _total_batch_loss(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         """Calculate total loss"""
 
@@ -288,7 +288,7 @@ def _should_evaluate(
 
     @staticmethod
     def batch_to_model_data_format(
-        batch: Union[List[tf.Tensor], List[np.ndarray]],
+        batch: Union[Tuple[tf.Tensor], Tuple[np.ndarray]],
         data_signature: Dict[Text, List[FeatureSignature]],
     ) -> Dict[Text, List[tf.Tensor]]:
         """Convert input batch tensors into batch data format.

From 4ee54fe105aee10ed3c3de10d3a57bf077f813c7 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 24 Jan 2020 18:01:45 +0100
Subject: [PATCH 207/633] pass pad_mask to transformer

---
 rasa/core/policies/embedding_policy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 482823ec87ad..0d46a3d376f1 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -442,7 +442,7 @@ def __init__(
         # metrics
         self.metric_loss = tf.keras.metrics.Mean(name="loss")
         self.metric_acc = tf.keras.metrics.Mean(name="acc")
-        self.metrics_to_log = ["loss", "acc"]
+        self.metrics_to_log += ["loss", "acc"]
 
         # set up tf layers
         self._tf_layers = {}
@@ -509,7 +509,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
 
         dialogue = self._tf_layers["ffnn.dialogue"](dialogue_in, self._training)
         dialogue_transformed = self._tf_layers["transformer"](
-            dialogue, tf.expand_dims(mask, axis=-1), self._training
+            dialogue, 1 - tf.expand_dims(mask, axis=-1), self._training
         )
 
         if self.max_history_tracker_featurizer_used:

From e0c859ebcd0ab4386a2188e27009c876dcc0a24e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Sat, 25 Jan 2020 00:31:38 +0100
Subject: [PATCH 208/633] remove regularization from transformer encoder
 layers, fix setting training to true/false

---
 rasa/utils/tensorflow/tf_layers.py | 114 ++++++++++++++---------------
 rasa/utils/tensorflow/tf_models.py |  17 ++---
 2 files changed, 62 insertions(+), 69 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index d60cc8fcc0e5..9555694c4e77 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Text, Tuple, Callable
 import tensorflow as tf
 import tensorflow_addons as tfa
+from tensorflow.python.keras.utils import tf_utils
 import numpy as np
 
 logger = logging.getLogger(__name__)
@@ -9,13 +10,16 @@
 
 class SparseDropout(tf.keras.layers.Dropout):
     def call(self, inputs: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+        def dropped_inputs():
+            to_retain_prob = tf.random.uniform(
+                tf.shape(inputs.values), 0, 1, inputs.values.dtype
+            )
+            to_retain = tf.greater_equal(to_retain_prob, self.rate)
+            return tf.sparse.retain(inputs, to_retain)
 
-        to_retain_prob = tf.random.uniform(
-            tf.shape(inputs.values), 0, 1, inputs.values.dtype
+        outputs = tf_utils.smart_cond(
+            training, dropped_inputs, lambda: tf.identity(inputs)
         )
-        to_retain = tf.greater_equal(to_retain_prob, self.rate)
-        dropped_inputs = tf.sparse.retain(inputs, to_retain)
-        outputs = tf.cond(training, lambda: dropped_inputs, lambda: inputs)
         # noinspection PyProtectedMember
         outputs._dense_shape = inputs._dense_shape
 
@@ -164,7 +168,7 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
 
         return output, attention_weights
 
-    def __init__(self, d_model: int, num_heads: int, reg_lambda: float) -> None:
+    def __init__(self, d_model: int, num_heads: int) -> None:
         super().__init__()
 
         self.num_heads = num_heads
@@ -174,17 +178,10 @@ def __init__(self, d_model: int, num_heads: int, reg_lambda: float) -> None:
 
         self._depth = d_model // self.num_heads
 
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._wq = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l2_regularizer
-        )
-        self._wk = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l2_regularizer
-        )
-        self._wv = tf.keras.layers.Dense(
-            d_model, use_bias=False, kernel_regularizer=l2_regularizer
-        )
-        self._dense = tf.keras.layers.Dense(d_model, kernel_regularizer=l2_regularizer)
+        self._wq = tf.keras.layers.Dense(d_model, use_bias=False)
+        self._wk = tf.keras.layers.Dense(d_model, use_bias=False)
+        self._wv = tf.keras.layers.Dense(d_model, use_bias=False)
+        self._dense = tf.keras.layers.Dense(d_model)
 
     def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
         """Split the last dimension into (num_heads, depth).
@@ -242,29 +239,19 @@ def call(
 
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(
-        self,
-        d_model: int,
-        num_heads: int,
-        dff: int,
-        reg_lambda: float,
-        rate: float = 0.1,
+        self, d_model: int, num_heads: int, dff: int, rate: float = 0.1,
     ) -> None:
         super().__init__()
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self._mha = MultiHeadAttention(d_model, num_heads, reg_lambda)
+        self._mha = MultiHeadAttention(d_model, num_heads)
         self._dropout = tf.keras.layers.Dropout(rate)
 
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
-            tf.keras.layers.Dense(
-                dff, activation="relu", kernel_regularizer=l2_regularizer
-            ),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dense(dff, activation="relu"),  # (batch_size, seq_len, dff)
             tf.keras.layers.Dropout(rate),
-            tf.keras.layers.Dense(
-                d_model, kernel_regularizer=l2_regularizer
-            ),  # (batch_size, seq_len, d_model)
+            tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
             tf.keras.layers.Dropout(rate),
         ]
 
@@ -339,7 +326,7 @@ def __init__(
         self._dropout = tf.keras.layers.Dropout(rate)
 
         self._enc_layers = [
-            TransformerEncoderLayer(d_model, num_heads, dff, reg_lambda, rate)
+            TransformerEncoderLayer(d_model, num_heads, dff, rate)
             for _ in range(num_layers)
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
@@ -385,42 +372,49 @@ def call(
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Randomly mask input sequences."""
 
-        # do not substitute with cls token
-        pad_mask_up_to_last = tf.math.cumprod(
-            1 - mask, axis=1, exclusive=True, reverse=True
-        )
-        mask_up_to_last = 1 - pad_mask_up_to_last
+        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
+        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
 
-        x_random_pad = (
-            tf.random.uniform(tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype)
-            * pad_mask_up_to_last
-        )
-        # shuffle over batch dim
-        x_shuffle = tf.random.shuffle(x * mask_up_to_last + x_random_pad)
+        def x_masked():
+            # do not substitute with cls token
+            pad_mask_up_to_last = tf.math.cumprod(
+                1 - mask, axis=1, exclusive=True, reverse=True
+            )
+            mask_up_to_last = 1 - pad_mask_up_to_last
 
-        # shuffle over sequence dim
-        x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
-        x_shuffle = tf.random.shuffle(x_shuffle)
-        x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
+            x_random_pad = (
+                tf.random.uniform(
+                    tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype
+                )
+                * pad_mask_up_to_last
+            )
+            # shuffle over batch dim
+            x_shuffle = tf.random.shuffle(x * mask_up_to_last + x_random_pad)
 
-        # shuffle doesn't support backprop
-        x_shuffle = tf.stop_gradient(x_shuffle)
+            # shuffle over sequence dim
+            x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
+            x_shuffle = tf.random.shuffle(x_shuffle)
+            x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])
 
-        mask_vector = tf.tile(self.mask_vector, (tf.shape(x)[0], tf.shape(x)[1], 1))
+            # shuffle doesn't support backprop
+            x_shuffle = tf.stop_gradient(x_shuffle)
 
-        other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
-        other_prob = tf.tile(other_prob, (1, 1, x.shape[-1]))
-        x_other = tf.where(
-            other_prob < 0.70, mask_vector, tf.where(other_prob < 0.80, x_shuffle, x)
-        )
+            mask_vector = tf.tile(self.mask_vector, (tf.shape(x)[0], tf.shape(x)[1], 1))
 
-        lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
-        x_masked = tf.where(tf.tile(lm_mask_bool, (1, 1, x.shape[-1])), x_other, x)
+            other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
+            other_prob = tf.tile(other_prob, (1, 1, x.shape[-1]))
+            x_other = tf.where(
+                other_prob < 0.70,
+                mask_vector,
+                tf.where(other_prob < 0.80, x_shuffle, x),
+            )
 
-        x_masked = tf.cond(training, lambda: x_masked, lambda: x)
+            return tf.where(tf.tile(lm_mask_bool, (1, 1, x.shape[-1])), x_other, x)
 
-        return x_masked, lm_mask_bool
+        return (
+            tf_utils.smart_cond(training, x_masked, lambda: tf.identity(x)),
+            lm_mask_bool,
+        )
 
 
 class CRF(tf.keras.layers.Layer):
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 4c703d68b3b1..7f3e80de97fb 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -23,7 +23,7 @@ def __init__(self, *args, **kwargs):
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
         self.metrics_to_log = ["t_loss"]
 
-        self._training = tf.ones((), tf.bool)
+        self._training = None  # training phase should be defined when building a graph
 
         self._predict_function = None
 
@@ -37,12 +37,6 @@ def batch_predict(
     ) -> Dict[Text, tf.Tensor]:
         raise NotImplementedError
 
-    def set_training_phase(self, training: bool) -> None:
-        if training:
-            self._training = tf.ones((), tf.bool)
-        else:
-            self._training = tf.zeros((), tf.bool)
-
     def fit(
         self,
         model_data: RasaModelData,
@@ -113,6 +107,7 @@ def fit(
 
             pbar.set_postfix(postfix_dict)
 
+        self._training = None  # training phase should be defined when building a graph
         if not disable:
             logger.info("Finished training.")
 
@@ -135,6 +130,7 @@ def predict_dataset_function(  # to reuse the same helper method
         ) -> tf.data.Dataset:
             return predict_data.as_tf_dataset(_batch_size, "sequence", shuffle=False)
 
+        self._training = False  # needed for tf graph mode
         _, self._predict_function = self._get_tf_functions(
             predict_dataset_function, self.batch_predict, eager, "prediction"
         )
@@ -146,7 +142,8 @@ def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
 
         predict_dataset = predict_data.as_tf_dataset(batch_size=1)
         batch_in = next(iter(predict_dataset))
-        self.set_training_phase(False)
+
+        self._training = False  # needed for eager mode
         return self._predict_function(batch_in)
 
     def save(self, model_file_name: Text) -> None:
@@ -197,7 +194,7 @@ def _batch_loop(
         """Run on batches"""
 
         self.reset_metrics()
-        self.set_training_phase(training)
+        self._training = training  # needed for eager mode
         for batch_in in dataset_function(batch_size):
             call_model_function(batch_in)
 
@@ -238,6 +235,7 @@ def train_dataset_function(
         ) -> tf.data.Dataset:
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
+        self._training = True  # needed for tf graph mode
         return self._get_tf_functions(
             train_dataset_function, self.train_on_batch, eager, "train"
         )
@@ -259,6 +257,7 @@ def evaluation_dataset_function(
                     _batch_size, "sequence", shuffle=False
                 )
 
+            self._training = False  # needed for tf graph mode
             return self._get_tf_functions(
                 evaluation_dataset_function, self._total_batch_loss, eager, "evaluation"
             )

From f1ced21c0fdcb2d6feda49841803101d591bf5a0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 09:32:40 +0100
Subject: [PATCH 209/633] update tests #1

---
 tests/core/test_policies.py                   |  2 +-
 tests/nlu/base/test_config.py                 |  2 +-
 .../test_embedding_intent_classifier.py       | 33 +------------------
 tests/utils/test_tf_model_data.py             |  6 ++++
 4 files changed, 9 insertions(+), 34 deletions(-)

diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 30c4c2e940bb..6eeb22ed5e07 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -335,7 +335,7 @@ def create_policy(self, featurizer, priority):
         return p
 
     def test_similarity_type(self, trained_policy):
-        assert trained_policy.similarity_type == "inner"
+        assert trained_policy.config[SIMILARITY_TYPE] == "inner"
 
     def test_ranking_length(self, trained_policy):
         assert trained_policy.ranking_length == 10
diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py
index be729075adb3..a937d0dbf067 100644
--- a/tests/nlu/base/test_config.py
+++ b/tests/nlu/base/test_config.py
@@ -80,4 +80,4 @@ def test_override_defaults_supervised_embeddings_pipeline():
 
     component2_cfg = cfg.for_component(1)
     component2 = builder.create_component(component2_cfg, cfg)
-    assert component2.epochs == 10
+    assert component2.component_config["epochs"] == 10
diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py
index e45dbfed1f51..67a991741d47 100644
--- a/tests/nlu/classifiers/test_embedding_intent_classifier.py
+++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py
@@ -38,37 +38,6 @@ def test_compute_default_label_features():
         assert o.shape == (1, len(label_features))
 
 
-def test_get_num_of_features():
-    session_data = {
-        "text_features": [
-            np.array(
-                [
-                    np.random.rand(5, 14),
-                    np.random.rand(2, 14),
-                    np.random.rand(3, 14),
-                    np.random.rand(1, 14),
-                    np.random.rand(3, 14),
-                ]
-            ),
-            np.array(
-                [
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))),
-                    scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                ]
-            ),
-        ]
-    }
-
-    num_features = EmbeddingIntentClassifier._get_num_of_features(
-        session_data, "text_features"
-    )
-
-    assert num_features == 24
-
-
 @pytest.mark.parametrize(
     "messages, expected",
     [
@@ -243,7 +212,7 @@ async def test_softmax_normalization(
     [({"loss_type": "margin", "random_seed": 42}, LABEL_RANKING_LENGTH)],
 )
 async def test_margin_loss_is_not_normalized(
-    monkeypatch, component_builder, tmpdir, classifier_params, output_length,
+    monkeypatch, component_builder, tmpdir, classifier_params, output_length
 ):
     pipeline = as_pipeline(
         "WhitespaceTokenizer", "CountVectorsFeaturizer", "EmbeddingIntentClassifier"
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index b035cc6eef50..ed12c623b0a3 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -148,3 +148,9 @@ def test_balance_session_data(model_data: RasaModelData):
     data = model_data.balanced_data(model_data.data, 2, False)
 
     assert np.all(data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))
+
+
+def test_get_num_of_features(model_data: RasaModelData):
+    num_features = model_data.get_feature_dimension("text_features")
+
+    assert num_features == 24

From b3fb6b6da1046d165a1b7e5d4cc98cbbc24bc448 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 11:13:18 +0100
Subject: [PATCH 210/633] update tests #2

---
 tests/core/test_policies.py | 58 ++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 6eeb22ed5e07..09eef4d736e2 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -39,6 +39,7 @@
     DEFAULT_STORIES_FILE,
 )
 from tests.core.utilities import get_tracker, read_dialogue_file, user_uttered
+from utils.tensorflow.constants import SIMILARITY_TYPE
 
 
 def tf_defaults():
@@ -174,16 +175,17 @@ def test_persist_and_load_empty_policy(self, tmpdir):
         loaded = empty_policy.__class__.load(tmpdir.strpath)
         assert loaded is not None
 
-    def test_tf_config(self, trained_policy, tmpdir):
-        if hasattr(trained_policy, "session"):
-            import tensorflow as tf
-
-            # noinspection PyProtectedMember
-            assert trained_policy.session._config == tf.Session()._config
-            trained_policy.persist(tmpdir.strpath)
-            loaded = trained_policy.__class__.load(tmpdir.strpath)
-            # noinspection PyProtectedMember
-            assert loaded.session._config == tf.Session()._config
+    # TODO test tf config
+    # def test_tf_config(self, trained_policy, tmpdir):
+    #     if hasattr(trained_policy, "session"):
+    #         import tensorflow as tf
+    #
+    #         # noinspection PyProtectedMember
+    #         assert trained_policy.session._config == tf.Session()._config
+    #         trained_policy.persist(tmpdir.strpath)
+    #         loaded = trained_policy.__class__.load(tmpdir.strpath)
+    #         # noinspection PyProtectedMember
+    #         assert loaded.session._config == tf.Session()._config
 
     @staticmethod
     def _get_next_action(policy, events, domain):
@@ -205,13 +207,14 @@ def create_policy(self, featurizer, priority):
         p = KerasPolicy(featurizer, priority, **tf_defaults())
         return p
 
-    def test_tf_config(self, trained_policy, tmpdir):
-        # noinspection PyProtectedMember
-        assert trained_policy.session._config == session_config()
-        trained_policy.persist(tmpdir.strpath)
-        loaded = trained_policy.__class__.load(tmpdir.strpath)
-        # noinspection PyProtectedMember
-        assert loaded.session._config == session_config()
+    # TODO test tf config
+    # def test_tf_config(self, trained_policy, tmpdir):
+    #     # noinspection PyProtectedMember
+    #     assert trained_policy.session._config == session_config()
+    #     trained_policy.persist(tmpdir.strpath)
+    #     loaded = trained_policy.__class__.load(tmpdir.strpath)
+    #     # noinspection PyProtectedMember
+    #     assert loaded.session._config == session_config()
 
 
 class TestSklearnPolicy(PolicyTestCollection):
@@ -365,33 +368,28 @@ async def test_gen_batch(self, trained_policy, default_domain):
         training_data = trained_policy.featurize_for_training(
             training_trackers, default_domain
         )
-        session_data = trained_policy._create_session_data(
-            training_data.X, training_data.y
-        )
+        model_data = trained_policy._create_modeldata(training_data.X, training_data.y)
         batch_size = 2
         batch_x, batch_y, _ = next(
-            train_utils.gen_batch(
-                session_data=session_data, batch_size=batch_size, label_key="action_ids"
-            )
+            model_data.gen_batch(batch_size=batch_size, label_key="label_ids")
         )
         assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size
         assert (
-            batch_x[0].shape == session_data["dialogue_features"][0][0].shape
-            and batch_y[0].shape == session_data["bot_features"][0][0].shape
+            batch_x[0].shape == model_data.get("dialogue_features")[0][0].shape
+            and batch_y[0].shape == model_data.get("label_features")[0][0].shape
         )
         batch_x, batch_y, _ = next(
-            train_utils.gen_batch(
-                session_data=session_data,
+            model_data.gen_batch(
                 batch_size=batch_size,
-                label_key="action_ids",
+                label_key="label_ids",
                 batch_strategy="balanced",
                 shuffle=True,
             )
         )
         assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size
         assert (
-            batch_x[0].shape == session_data["dialogue_features"][0][0].shape
-            and batch_y[0].shape == session_data["bot_features"][0][0].shape
+            batch_x[0].shape == model_data.get("dialogue_features")[0][0].shape
+            and batch_y[0].shape == model_data.get("label_features")[0][0].shape
         )
 
 

From c2f5d2642fa504b2e9d35f3ac207b8d58fb29525 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 11:17:49 +0100
Subject: [PATCH 211/633] review comments

---
 rasa/core/policies/embedding_policy.py             | 14 +++-----------
 .../nlu/classifiers/embedding_intent_classifier.py | 14 ++++----------
 rasa/utils/train_utils.py                          | 11 +++++++++++
 3 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 0d46a3d376f1..3e6ad4d0b42e 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -143,11 +143,7 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
 
-        if self.config[SIMILARITY_TYPE] == "auto":
-            if self.config[LOSS_TYPE] == "softmax":
-                self.config[SIMILARITY_TYPE] = "inner"
-            elif self.config[LOSS_TYPE] == "margin":
-                self.config[SIMILARITY_TYPE] = "cosine"
+        self.config = train_utils.update_auto_similarity_type(self.config)
 
         if self.config[EVAL_NUM_EPOCHS] < 1:
             self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
@@ -215,7 +211,7 @@ def _create_label_data(self, domain: Domain) -> RasaModelData:
         all_labels = state_featurizer.create_encoded_all_actions(domain)
         all_labels = all_labels.astype(np.float32)
 
-        label_data = RasaModelData(label_key="label_features")
+        label_data = RasaModelData()
         label_data.add_features("label_features", [all_labels])
         return label_data
 
@@ -378,11 +374,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
             meta = pickle.load(f)
 
-        if meta[SIMILARITY_TYPE] == "auto":
-            if meta[LOSS_TYPE] == "softmax":
-                meta[SIMILARITY_TYPE] = "inner"
-            elif meta[LOSS_TYPE] == "margin":
-                meta[SIMILARITY_TYPE] = "cosine"
+        meta = train_utils.update_auto_similarity_type(meta)
 
         model = TED.load(
             tf_model_file,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index cf12b2f0e4ba..075b20cc5f92 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -156,11 +156,9 @@ def _check_config_parameters(self) -> None:
                 "hidden_layer_sizes for text and label must coincide."
             )
 
-        if self.component_config[SIMILARITY_TYPE] == "auto":
-            if self.component_config[LOSS_TYPE] == "softmax":
-                self.component_config[SIMILARITY_TYPE] = "inner"
-            elif self.component_config[LOSS_TYPE] == "margin":
-                self.component_config[SIMILARITY_TYPE] = "cosine"
+        self.component_config = train_utils.update_auto_similarity_type(
+            self.component_config
+        )
 
         if self.component_config[EVAL_NUM_EPOCHS] < 1:
             self.component_config[EVAL_NUM_EPOCHS] = self.component_config[EPOCHS]
@@ -764,11 +762,7 @@ def load(
         ) as f:
             batch_tuple_sizes = pickle.load(f)
 
-        if meta[SIMILARITY_TYPE] == "auto":
-            if meta[LOSS_TYPE] == "softmax":
-                meta[SIMILARITY_TYPE] = "inner"
-            elif meta[LOSS_TYPE] == "margin":
-                meta[SIMILARITY_TYPE] = "cosine"
+        meta = train_utils.update_auto_similarity_type(meta)
 
         model = DIET.load(
             tf_model_file,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index e71f0ee69fc4..2ae6c4e84871 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -3,6 +3,7 @@
 import numpy as np
 from typing import Optional, Text, Dict, Any
 
+from rasa.utils.tensorflow.constants import SIMILARITY_TYPE, LOSS_TYPE
 
 logger = logging.getLogger(__name__)
 
@@ -30,3 +31,13 @@ def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarr
         new_values = new_values / np.sum(new_values)
 
     return new_values
+
+
+def update_auto_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
+    if config[SIMILARITY_TYPE] == "auto":
+        if config[LOSS_TYPE] == "softmax":
+            config[SIMILARITY_TYPE] = "inner"
+        elif config[LOSS_TYPE] == "margin":
+            config[SIMILARITY_TYPE] = "cosine"
+
+    return config

From 0c6a1daa20a004922acb27b6bdecc78023055235 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 13:10:30 +0100
Subject: [PATCH 212/633] fix imports

---
 rasa/core/policies/embedding_policy.py        | 29 +++++++++++++-
 .../embedding_intent_classifier.py            | 38 ++++++++++++++++++-
 .../selectors/embedding_response_selector.py  | 36 +++++++++++++++++-
 rasa/utils/tensorflow/tf_layers.py            |  4 +-
 tests/core/test_policies.py                   |  2 +-
 5 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 3e6ad4d0b42e..2a1764d2de40 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -23,7 +23,34 @@
 from rasa.utils.tensorflow import tf_layers
 from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
-from rasa.utils.tensorflow.constants import *
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_LABEL,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    POS_ENCODING,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    C_EMB,
+    C2,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+    HIDDEN_LAYERS_SIZES_DIALOGUE,
+    DROPRATE_DIALOGUE,
+    DROPRATE_LABEL,
+)
 
 
 logger = logging.getLogger(__name__)
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 075b20cc5f92..fa9bb843ff4c 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -7,7 +7,7 @@
 import tensorflow as tf
 import tensorflow_addons as tfa
 
-from typing import Any, Dict, List, Optional, Text, Tuple, Union, Callable
+from typing import Any, Dict, List, Optional, Text, Tuple, Union
 
 import rasa.utils.io
 from rasa.nlu.extractors import EntityExtractor
@@ -19,7 +19,6 @@
 from rasa.utils.tensorflow import tf_layers
 from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
-from rasa.utils.tensorflow.constants import *
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -32,6 +31,41 @@
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_TEXT,
+    HIDDEN_LAYERS_SIZES_LABEL,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    POS_ENCODING,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    SPARSE_INPUT_DROPOUT,
+    MASKED_LM,
+    ENTITY_RECOGNITION,
+    INTENT_CLASSIFICATION,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
+    DROPRATE,
+    C_EMB,
+    C2,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+)
 
 logger = logging.getLogger(__name__)
 
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 6fd00a6c0691..6e0c5ad21800 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -4,7 +4,41 @@
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.components import any_of
-from rasa.utils.tensorflow.constants import *
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_TEXT,
+    HIDDEN_LAYERS_SIZES_LABEL,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    POS_ENCODING,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    SPARSE_INPUT_DROPOUT,
+    MASKED_LM,
+    ENTITY_RECOGNITION,
+    INTENT_CLASSIFICATION,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
+    DROPRATE,
+    C_EMB,
+    C2,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+)
 from rasa.nlu.constants import (
     RESPONSE_ATTRIBUTE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 9555694c4e77..f637545072ee 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -239,7 +239,7 @@ def call(
 
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(
-        self, d_model: int, num_heads: int, dff: int, rate: float = 0.1,
+        self, d_model: int, num_heads: int, dff: int, rate: float = 0.1
     ) -> None:
         super().__init__()
 
@@ -257,6 +257,8 @@ def __init__(
 
     def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
+        tf.print(training)
+
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
         attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
         attn_out = self._dropout(attn_out, training=training)
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 09eef4d736e2..74f770a3cfd9 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -32,6 +32,7 @@
 from rasa.core.policies.memoization import AugmentedMemoizationPolicy, MemoizationPolicy
 from rasa.core.policies.sklearn_policy import SklearnPolicy
 from rasa.core.trackers import DialogueStateTracker
+from rasa.utils.tensorflow.constants import SIMILARITY_TYPE
 from rasa.utils import train_utils
 from tests.core.conftest import (
     DEFAULT_DOMAIN_PATH_WITH_MAPPING,
@@ -39,7 +40,6 @@
     DEFAULT_STORIES_FILE,
 )
 from tests.core.utilities import get_tracker, read_dialogue_file, user_uttered
-from utils.tensorflow.constants import SIMILARITY_TYPE
 
 
 def tf_defaults():

From e9384f1392ceca95315f2e0e3561b49e61b38cae Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 14:37:10 +0100
Subject: [PATCH 213/633] fix make lint errors

---
 rasa/core/policies/keras_policy.py | 10 ----------
 rasa/utils/tensorflow/tf_models.py |  2 +-
 setup.cfg                          |  2 +-
 3 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 2f6cd61bb49c..5397266998cf 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -71,13 +71,10 @@ def __init__(
         self.current_epoch = current_epoch
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
-        from rasa.utils.train_utils import load_tf_config
-
         config = copy.deepcopy(self.defaults)
         config.update(kwargs)
 
         # filter out kwargs that are used explicitly
-        self._tf_config = load_tf_config(config)
         self.rnn_size = config.pop("rnn_size")
         self.epochs = config.pop("epochs")
         self.batch_size = config.pop("batch_size")
@@ -273,9 +270,6 @@ def persist(self, path: Text) -> None:
             rasa.utils.io.create_directory_for_file(model_file)
             self.model.save(model_file, overwrite=True)
 
-            tf_config_file = os.path.join(path, "keras_policy.tf_config.pkl")
-            with open(tf_config_file, "wb") as f:
-                pickle.dump(self._tf_config, f)
         else:
             warnings.warn(
                 "Method `persist(...)` was called "
@@ -293,10 +287,6 @@ def load(cls, path: Text) -> "KerasPolicy":
             if os.path.isfile(meta_file):
                 meta = json.loads(rasa.utils.io.read_file(meta_file))
 
-                tf_config_file = os.path.join(path, "keras_policy.tf_config.pkl")
-                with open(tf_config_file, "rb") as f:
-                    _tf_config = pickle.load(f)
-
                 model_file = os.path.join(path, meta["model"])
 
                 with warnings.catch_warnings():
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 7f3e80de97fb..d607cda75685 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -291,7 +291,7 @@ def batch_to_model_data_format(
         data_signature: Dict[Text, List[FeatureSignature]],
     ) -> Dict[Text, List[tf.Tensor]]:
         """Convert input batch tensors into batch data format.
-    
+
         Batch contains any number of batch data. The order is equal to the
         key-value pairs in session data. As sparse data were converted into indices, data,
         shape before, this methods converts them into sparse tensors. Dense data is
diff --git a/setup.cfg b/setup.cfg
index 1efcf0481b85..1370760eb4b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,4 +23,4 @@ license_file = LICENSE.txt
 
 [flake8]
 max-line-length = 88
-ignore = W503, E121, E126, E211, E225, E501, E203, E402, F401, F811, E231
+ignore = W503, E121, E126, E211, E225, E501, E203, E402, F401, F811, E231, F901

From 9b7f90a977248187c12f354ab90c1a59e952a394 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 15:33:02 +0100
Subject: [PATCH 214/633] resolve error of pytype check

---
 .../embedding_intent_classifier.py            | 47 ++++++++++++-------
 rasa/utils/tensorflow/tf_model_data.py        |  8 ++--
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fa9bb843ff4c..dde7227fa8be 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -231,8 +231,6 @@ def __init__(
         # number of entity tags
         self.num_tags = 0
 
-        self._tf_config = train_utils.load_tf_config(self.component_config)
-
         self.data_example = None
 
     # training data helpers:
@@ -741,9 +739,6 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         with open(os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "wb") as f:
             pickle.dump(self.inverted_tag_dict, f)
 
-        with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "wb") as f:
-            pickle.dump(self._tf_config, f)
-
         with open(
             os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "wb"
         ) as f:
@@ -772,9 +767,6 @@ def load(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        # with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f:
-        #    _tf_config = pickle.load(f)
-
         with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "rb") as f:
             model_data_example = RasaModelData(
                 label_key="label_ids", data=pickle.load(f)
@@ -1022,11 +1014,15 @@ def _combine_sparse_dense_features(
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 if sparse_dropout:
-                    _f = self._tf_layers["sparse_dropout"](f, self._training)
+                    _f = self._tf_layers["sparse_dropout"](
+                        f, self._training
+                    )  # pytype: disable=key-error
                 else:
                     _f = f
 
-                dense_features.append(self._tf_layers[f"sparse_to_dense.{name}"](_f))
+                dense_features.append(
+                    self._tf_layers[f"sparse_to_dense.{name}"](_f)
+                )  # pytype: disable=key-error
             else:
                 dense_features.append(f)
 
@@ -1055,11 +1051,15 @@ def _create_sequence(
         )
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self._training)
+            pre, lm_mask_bool = self._tf_layers["input_mask"](
+                x, mask, self._training
+            )  # pytype: disable=key-error
         else:
             pre, lm_mask_bool = (x, None)
 
-        transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
+        transformed = self._tf_layers["transformer"](
+            pre, 1 - mask, self._training
+        )  # pytype: disable=key-error
         transformed = tf.nn.relu(transformed)
 
         return transformed, x, lm_mask_bool
@@ -1070,7 +1070,9 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
             self.tf_label_data["label_mask"][0],
             "label",
         )
-        all_labels_embed = self._tf_layers["embed.label"](all_labels)
+        all_labels_embed = self._tf_layers["embed.label"](
+            all_labels
+        )  # pytype: disable=key-error
 
         return all_labels, all_labels_embed
 
@@ -1094,22 +1096,26 @@ def _mask_loss(
         a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
         a_masked = tf.boolean_mask(a, lm_mask_bool)
 
+        # pytype: disable=key-error
         a_t_masked_embed = self._tf_layers["embed.lm_mask"](a_t_masked)
         a_masked_embed = self._tf_layers["embed.golden_token"](a_masked)
 
         return self._tf_layers["loss.mask"](
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
+        # pytype: enable=key-error
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels, all_labels_embed = self._create_all_labels()
 
+        # pytype: disable=key-error
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
 
         return self._tf_layers["loss.label"](
             a_embed, b_embed, b, all_labels_embed, all_labels
         )
+        # pytype: enable=key-error
 
     def _entity_loss(
         self, a: tf.Tensor, c: tf.Tensor, mask: tf.Tensor, sequence_lengths
@@ -1118,10 +1124,13 @@ def _entity_loss(
         # remove cls token
         sequence_lengths = sequence_lengths - 1
         c = tf.cast(c[:, :, 0], tf.int32)
+
+        # pytype: disable=key-error
         logits = self._tf_layers["embed.logits"](a)
 
         loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
+        # pytype: enable=key-error
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
@@ -1133,12 +1142,14 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        f1 = self._tf_layers["crf_f1_score"](c_masked_1, pred_ids_masked_1)
+        f1 = self._tf_layers["crf_f1_score"](
+            c_masked_1, pred_ids_masked_1
+        )  # pytype: disable=key-error
 
         return loss, f1
 
     def batch_loss(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
@@ -1182,7 +1193,7 @@ def batch_loss(
         return tf.math.add_n(losses)
 
     def batch_predict(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> Dict[Text, tf.Tensor]:
         tf_batch_data = self.batch_to_model_data_format(
             batch_in, self.predict_data_signature
@@ -1202,6 +1213,7 @@ def batch_predict(
 
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
+            # pytype: disable=key-error
             cls_embed = self._tf_layers["embed.text"](cls)
 
             sim_all = self._tf_layers["loss.label"].sim(
@@ -1210,11 +1222,14 @@ def batch_predict(
             scores = self._tf_layers["loss.label"].confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
             )
+            # pytype: enable=key-error
             out["i_scores"] = scores
 
         if self.config[ENTITY_RECOGNITION]:
+            # pytype: disable=key-error
             logits = self._tf_layers["embed.logits"](text_transformed)
             pred_ids = self._tf_layers["crf"](logits, sequence_lengths - 1)
+            # pytype: enable=key-error
             out["e_ids"] = pred_ids
 
         return out
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index b9072be1bc06..dc8412d30f17 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -16,9 +16,7 @@ class FeatureSignature(NamedTuple):
 
 
 class RasaModelData:
-    def __init__(
-        self, label_key: Optional[Text] = None, data: Data = None,
-    ):
+    def __init__(self, label_key: Optional[Text] = None, data: Data = None):
         self.data = data or {}
         self.label_key = label_key or ""
         # will be updated when features are added
@@ -377,6 +375,10 @@ def _data_for_ids(data: Data, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
         """Filter session data by ids."""
 
         new_data = defaultdict(list)
+
+        if data is None:
+            return new_data
+
         for k, values in data.items():
             for v in values:
                 new_data[k].append(v[ids])

From 5a39e3b1bda686350215b09f5a5002ef78af5e8e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 15:52:50 +0100
Subject: [PATCH 215/633] rename EmbeddingPolicy to TEDPolicy

---
 .../{embedding_policy.py => TED_policy.py}    |  4 ++--
 rasa/core/policies/registry.py                |  2 +-
 .../embedding_intent_classifier.py            |  3 +++
 .../selectors/embedding_response_selector.py  |  4 ++++
 tests/core/test_policies.py                   | 24 ++++++++-----------
 5 files changed, 20 insertions(+), 17 deletions(-)
 rename rasa/core/policies/{embedding_policy.py => TED_policy.py} (99%)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/TED_policy.py
similarity index 99%
rename from rasa/core/policies/embedding_policy.py
rename to rasa/core/policies/TED_policy.py
index 2a1764d2de40..6654792dc3e8 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -56,7 +56,7 @@
 logger = logging.getLogger(__name__)
 
 
-class EmbeddingPolicy(Policy):
+class TEDPolicy(Policy):
     """Transformer Embedding Dialogue Policy (TEDP)
 
     Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707
@@ -373,7 +373,7 @@ def persist(self, path: Text):
             pickle.dump(self._label_data, f)
 
     @classmethod
-    def load(cls, path: Text) -> "EmbeddingPolicy":
+    def load(cls, path: Text) -> "TEDPolicy":
         """Loads a policy from the storage.
 
         **Needs to load its featurizer**
diff --git a/rasa/core/policies/registry.py b/rasa/core/policies/registry.py
index 6d11c97b3b0f..1c26d9ce21a5 100644
--- a/rasa/core/policies/registry.py
+++ b/rasa/core/policies/registry.py
@@ -2,7 +2,7 @@
 # path. Don't do this in `__init__.py` to avoid importing them without need.
 
 # noinspection PyUnresolvedReferences
-from rasa.core.policies.embedding_policy import EmbeddingPolicy
+from rasa.core.policies.TED_policy import TEDPolicy
 
 # noinspection PyUnresolvedReferences
 from rasa.core.policies.fallback import FallbackPolicy
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index dde7227fa8be..a27ce84e1f89 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -173,7 +173,10 @@ class EmbeddingIntentClassifier(EntityExtractor):
         INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
         ENTITY_RECOGNITION: True,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
         MASKED_LM: False,
+        # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
     }
     # end default properties (DOC MARKER - don't remove)
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 6e0c5ad21800..3d5a55fd0d52 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -162,8 +162,12 @@ class ResponseSelector(EmbeddingIntentClassifier):
         # if true intent classification is trained and intent predicted
         INTENT_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
+        # (should always be false)
         ENTITY_RECOGNITION: False,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
         MASKED_LM: False,
+        # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
     }
 
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 74f770a3cfd9..04510136968c 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -24,7 +24,7 @@
     FullDialogueTrackerFeaturizer,
 )
 from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy
-from rasa.core.policies.embedding_policy import EmbeddingPolicy
+from rasa.core.policies.TED_policy import TEDPolicy
 from rasa.core.policies.fallback import FallbackPolicy
 from rasa.core.policies.form_policy import FormPolicy
 from rasa.core.policies.keras_policy import KerasPolicy
@@ -334,7 +334,7 @@ def test_train_with_shuffle_false(
 
 class TestEmbeddingPolicy(PolicyTestCollection):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(featurizer=featurizer, priority=priority)
+        p = TEDPolicy(featurizer=featurizer, priority=priority)
         return p
 
     def test_similarity_type(self, trained_policy):
@@ -395,7 +395,7 @@ async def test_gen_batch(self, trained_policy, default_domain):
 
 class TestEmbeddingPolicyMargin(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(
+        p = TEDPolicy(
             featurizer=featurizer, priority=priority, **{"loss_type": "margin"}
         )
         return p
@@ -415,7 +415,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
 
 class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(
+        p = TEDPolicy(
             featurizer=featurizer,
             priority=priority,
             **{"scale_loss": False, "evaluate_on_num_examples": 4},
@@ -425,9 +425,7 @@ def create_policy(self, featurizer, priority):
 
 class TestEmbeddingPolicyNoNormalization(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(
-            featurizer=featurizer, priority=priority, **{"ranking_length": 0}
-        )
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 0})
         return p
 
     def test_ranking_length(self, trained_policy):
@@ -451,9 +449,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
 
 class TestEmbeddingPolicyLowRankingLength(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(
-            featurizer=featurizer, priority=priority, **{"ranking_length": 3}
-        )
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 3})
         return p
 
     def test_ranking_length(self, trained_policy):
@@ -462,7 +458,7 @@ def test_ranking_length(self, trained_policy):
 
 class TestEmbeddingPolicyHighRankingLength(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(
+        p = TEDPolicy(
             featurizer=featurizer, priority=priority, **{"ranking_length": 11}
         )
         return p
@@ -476,7 +472,7 @@ def create_policy(self, featurizer, priority):
         # use standard featurizer from EmbeddingPolicy,
         # since it is using FullDialogueTrackerFeaturizer
         # if max_history is not specified
-        p = EmbeddingPolicy(priority=priority)
+        p = TEDPolicy(priority=priority)
         return p
 
     def test_featurizer(self, trained_policy, tmpdir):
@@ -498,7 +494,7 @@ def create_policy(self, featurizer, priority):
         # use standard featurizer from EmbeddingPolicy,
         # since it is using MaxHistoryTrackerFeaturizer
         # if max_history is specified
-        p = EmbeddingPolicy(priority=priority, max_history=self.max_history)
+        p = TEDPolicy(priority=priority, max_history=self.max_history)
         return p
 
     def test_featurizer(self, trained_policy, tmpdir):
@@ -519,7 +515,7 @@ def test_featurizer(self, trained_policy, tmpdir):
 
 class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy):
     def create_policy(self, featurizer, priority):
-        p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
         return p
 
     def test_tf_config(self, trained_policy, tmpdir):

From b236a15505cb1e8d782ec6ff2558b3de07106ed6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 15:55:52 +0100
Subject: [PATCH 216/633] rename EmbeddingIntentClassifier to DIETClassifier

---
 ...embedding_intent_classifier.py => DIET_classifier.py} | 6 +++---
 rasa/nlu/registry.py                                     | 4 ++--
 rasa/nlu/selectors/embedding_response_selector.py        | 4 ++--
 .../nlu/classifiers/test_embedding_intent_classifier.py  | 9 +++------
 4 files changed, 10 insertions(+), 13 deletions(-)
 rename rasa/nlu/classifiers/{embedding_intent_classifier.py => DIET_classifier.py} (99%)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
similarity index 99%
rename from rasa/nlu/classifiers/embedding_intent_classifier.py
rename to rasa/nlu/classifiers/DIET_classifier.py
index a27ce84e1f89..f272e05130c7 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -70,7 +70,7 @@
 logger = logging.getLogger(__name__)
 
 
-class EmbeddingIntentClassifier(EntityExtractor):
+class DIETClassifier(EntityExtractor):
     """label classifier using supervised embeddings.
 
     The embedding intent classifier embeds user inputs
@@ -755,9 +755,9 @@ def load(
         meta: Dict[Text, Any],
         model_dir: Text = None,
         model_metadata: "Metadata" = None,
-        cached_component: Optional["EmbeddingIntentClassifier"] = None,
+        cached_component: Optional["DIETClassifier"] = None,
         **kwargs: Any,
-    ) -> "EmbeddingIntentClassifier":
+    ) -> "DIETClassifier":
         """Loads the trained model from the provided directory."""
 
         if not model_dir or not meta.get("file"):
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index bacc27dd6070..3a4cab41bb8c 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -9,7 +9,7 @@
 import typing
 from typing import Any, Dict, List, Optional, Text, Type
 
-from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
 from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
@@ -76,7 +76,7 @@
     SklearnIntentClassifier,
     MitieIntentClassifier,
     KeywordIntentClassifier,
-    EmbeddingIntentClassifier,
+    DIETClassifier,
     # selectors
     ResponseSelector,
 ]
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 3d5a55fd0d52..3f3356354ca6 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, Text
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_TEXT,
@@ -53,7 +53,7 @@
 logger = logging.getLogger(__name__)
 
 
-class ResponseSelector(EmbeddingIntentClassifier):
+class ResponseSelector(DIETClassifier):
     """Response selector using supervised embeddings.
 
     The response selector embeds user inputs
diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py
index 67a991741d47..51ae5c666b68 100644
--- a/tests/nlu/classifiers/test_embedding_intent_classifier.py
+++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py
@@ -13,7 +13,7 @@
     DENSE_FEATURE_NAMES,
     INTENT_ATTRIBUTE,
 )
-from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
 from rasa.nlu.model import Interpreter
 from rasa.nlu.training_data import Message
 from rasa.utils import train_utils
@@ -28,7 +28,7 @@ def test_compute_default_label_features():
         Message("test d"),
     ]
 
-    output = EmbeddingIntentClassifier._compute_default_label_features(label_features)
+    output = DIETClassifier._compute_default_label_features(label_features)
 
     output = output[0]
 
@@ -77,10 +77,7 @@ def test_compute_default_label_features():
 def test_check_labels_features_exist(messages, expected):
     attribute = TEXT_ATTRIBUTE
 
-    assert (
-        EmbeddingIntentClassifier._check_labels_features_exist(messages, attribute)
-        == expected
-    )
+    assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected
 
 
 async def test_train(component_builder, tmpdir):

From 34783ee0ca5baa5e1fb4b85d22da0953ac1c2e71 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 16:54:10 +0100
Subject: [PATCH 217/633] Fix docs

---
 docs/core/policies.rst                        | 10 ++++-----
 docs/nlu/components.rst                       | 14 ++++++------
 tests/core/test_policies.py                   | 22 +++++++++----------
 tests/nlu/base/test_classifiers.py            |  1 -
 ..._classifier.py => test_DIET_classifier.py} | 17 ++++++--------
 tests/nlu/training/test_train.py              |  2 +-
 6 files changed, 31 insertions(+), 35 deletions(-)
 rename tests/nlu/classifiers/{test_embedding_intent_classifier.py => test_DIET_classifier.py} (92%)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index f4d1ea0b5c92..853291a8b6ee 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -148,7 +148,7 @@ expected outcome in the case of a tie. They look like this, where higher numbers
     | 4. ``FallbackPolicy`` and ``TwoStageFallbackPolicy``
     | 3. ``MemoizationPolicy`` and ``AugmentedMemoizationPolicy``
     | 2. ``MappingPolicy``
-    | 1. ``EmbeddingPolicy``, ``KerasPolicy``, and ``SklearnPolicy``
+    | 1. ``TEDPolicy``, ``KerasPolicy``, and ``SklearnPolicy``
 
 This priority hierarchy ensures that, for example, if there is an intent with a mapped action, but the NLU confidence is not
 above the ``nlu_threshold``, the bot will still fall back. In general, it is not recommended to have more
@@ -192,7 +192,7 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer.
 
 .. _embedding_policy:
 
-Embedding Policy
+TED Policy
 ^^^^^^^^^^^^^^^^
 
 Transformer Embedding Dialogue Policy (TEDP)
@@ -223,11 +223,11 @@ It is recommended to use
 **Configuration:**
 
     Configuration parameters can be passed as parameters to the
-    ``EmbeddingPolicy`` within the policy configuration file.
+    ``TEDPolicy`` within the policy configuration file.
 
     .. warning::
 
-        Pass an appropriate number of ``epochs`` to the ``EmbeddingPolicy``,
+        Pass an appropriate number of ``epochs`` to the ``TEDPolicy``,
         otherwise the policy will be trained only for ``1``
         epoch.
 
@@ -345,7 +345,7 @@ It is recommended to use
     These parameters can be specified in the policy configuration file.
     The default values are defined in ``EmbeddingPolicy.defaults``:
 
-    .. literalinclude:: ../../rasa/core/policies/embedding_policy.py
+    .. literalinclude:: ../../rasa/core/policies/TED_policy.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index b88a44494354..78fb8d13bb81 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -147,7 +147,7 @@ ConveRTFeaturizer
     `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model.
 :Outputs:
     nothing, used as an input to intent classifiers and response selectors that need intent features and response
-    features respectively (e.g. ``EmbeddingIntentClassifier`` and ``ResponseSelector``)
+    features respectively (e.g. ``DIETClassifier`` and ``ResponseSelector``)
 :Requires: :ref:`ConveRTTokenizer`
 :Type: Dense featurizer
 :Description:
@@ -197,7 +197,7 @@ CountVectorsFeaturizer
 :Outputs:
    nothing, used as an input to intent classifiers that
    need bag-of-words representation of intent features
-   (e.g. ``EmbeddingIntentClassifier``)
+   (e.g. ``DIETClassifier``)
 :Requires: nothing
 :Type: Sparse featurizer
 :Description:
@@ -452,7 +452,7 @@ SklearnIntentClassifier
           # This is used with the ``C`` hyperparameter in GridSearchCV.
           kernels: ["linear"]
 
-EmbeddingIntentClassifier
+DIETClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :Short: Embedding intent classifier
@@ -565,9 +565,9 @@ EmbeddingIntentClassifier
               If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
 
     In the config, you can specify these parameters.
-    The default values are defined in ``EmbeddingIntentClassifier.defaults``:
+    The default values are defined in ``DIETClassifier.defaults``:
 
-    .. literalinclude:: ../../rasa/nlu/classifiers/embedding_intent_classifier.py
+    .. literalinclude:: ../../rasa/nlu/classifiers/DIET_classifier.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
@@ -653,7 +653,7 @@ Response Selector
     Response Selector component can be used to build a response retrieval model to directly predict a bot response from
     a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
     It embeds user inputs and response labels into the same space and follows the exact same
-    neural network architecture and optimization as the ``EmbeddingIntentClassifier``.
+    neural network architecture and optimization as the ``DIETClassifier``.
 
     The response selector needs to be preceded by a featurizer in the pipeline.
     This featurizer creates the features used for the embeddings.
@@ -666,7 +666,7 @@ Response Selector
 
 :Configuration:
 
-    The algorithm includes all the hyperparameters that ``EmbeddingIntentClassifier`` uses.
+    The algorithm includes all the hyperparameters that ``DIETClassifier`` uses.
     In addition, the component can also be configured to train a response selector for a particular retrieval intent
 
         - ``retrieval_intent``: sets the name of the intent for which this response selector model is trained. Default ``None``
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 04510136968c..4e516c6bf14b 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -332,7 +332,7 @@ def test_train_with_shuffle_false(
         policy.train(trackers, domain=default_domain)
 
 
-class TestEmbeddingPolicy(PolicyTestCollection):
+class TestTEDPolicy(PolicyTestCollection):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(featurizer=featurizer, priority=priority)
         return p
@@ -393,7 +393,7 @@ async def test_gen_batch(self, trained_policy, default_domain):
         )
 
 
-class TestEmbeddingPolicyMargin(TestEmbeddingPolicy):
+class TestTEDPolicyMargin(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(
             featurizer=featurizer, priority=priority, **{"loss_type": "margin"}
@@ -413,7 +413,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
         mock.normalize.assert_not_called()
 
 
-class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy):
+class TestTEDPolicyWithEval(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(
             featurizer=featurizer,
@@ -423,7 +423,7 @@ def create_policy(self, featurizer, priority):
         return p
 
 
-class TestEmbeddingPolicyNoNormalization(TestEmbeddingPolicy):
+class TestTEDPolicyNoNormalization(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 0})
         return p
@@ -447,7 +447,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
         mock.normalize.assert_not_called()
 
 
-class TestEmbeddingPolicyLowRankingLength(TestEmbeddingPolicy):
+class TestTEDPolicyLowRankingLength(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 3})
         return p
@@ -456,7 +456,7 @@ def test_ranking_length(self, trained_policy):
         assert trained_policy.ranking_length == 3
 
 
-class TestEmbeddingPolicyHighRankingLength(TestEmbeddingPolicy):
+class TestTEDPolicyHighRankingLength(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(
             featurizer=featurizer, priority=priority, **{"ranking_length": 11}
@@ -467,9 +467,9 @@ def test_ranking_length(self, trained_policy):
         assert trained_policy.ranking_length == 11
 
 
-class TestEmbeddingPolicyWithFullDialogue(TestEmbeddingPolicy):
+class TestTEDPolicyWithFullDialogue(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        # use standard featurizer from EmbeddingPolicy,
+        # use standard featurizer from TEDPolicy,
         # since it is using FullDialogueTrackerFeaturizer
         # if max_history is not specified
         p = TEDPolicy(priority=priority)
@@ -489,9 +489,9 @@ def test_featurizer(self, trained_policy, tmpdir):
         )
 
 
-class TestEmbeddingPolicyWithMaxHistory(TestEmbeddingPolicy):
+class TestTEDPolicyWithMaxHistory(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        # use standard featurizer from EmbeddingPolicy,
+        # use standard featurizer from TEDPolicy,
         # since it is using MaxHistoryTrackerFeaturizer
         # if max_history is specified
         p = TEDPolicy(priority=priority, max_history=self.max_history)
@@ -513,7 +513,7 @@ def test_featurizer(self, trained_policy, tmpdir):
         )
 
 
-class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy):
+class TestTEDPolicyWithTfConfig(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
         return p
diff --git a/tests/nlu/base/test_classifiers.py b/tests/nlu/base/test_classifiers.py
index b0248f19879d..45fcc27fd152 100644
--- a/tests/nlu/base/test_classifiers.py
+++ b/tests/nlu/base/test_classifiers.py
@@ -5,7 +5,6 @@
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 
 # TODO: add tests for other classifers
-# from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 # from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
 # from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
 from rasa.nlu.training_data.formats.rasa import RasaReader
diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_DIET_classifier.py
similarity index 92%
rename from tests/nlu/classifiers/test_embedding_intent_classifier.py
rename to tests/nlu/classifiers/test_DIET_classifier.py
index 51ae5c666b68..baae8f4dfa58 100644
--- a/tests/nlu/classifiers/test_embedding_intent_classifier.py
+++ b/tests/nlu/classifiers/test_DIET_classifier.py
@@ -85,7 +85,7 @@ async def test_train(component_builder, tmpdir):
         {"name": "ConveRTTokenizer"},
         {"name": "CountVectorsFeaturizer"},
         {"name": "ConveRTFeaturizer"},
-        {"name": "EmbeddingIntentClassifier"},
+        {"name": "DIETClassifier"},
     ]
 
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})
@@ -110,10 +110,7 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
 
     _config = RasaNLUModelConfig(
         {
-            "pipeline": [
-                {"name": "WhitespaceTokenizer"},
-                {"name": "EmbeddingIntentClassifier"},
-            ],
+            "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "DIETClassifier"}],
             "language": "en",
         }
     )
@@ -127,7 +124,7 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
         )
 
     assert (
-        "Failed to validate component 'EmbeddingIntentClassifier'. Missing one of "
+        "Failed to validate component 'DIETClassifier'. Missing one of "
         "the following properties: " in str(e.value)
     )
 
@@ -175,9 +172,9 @@ async def test_softmax_normalization(
     output_should_sum_to_1,
 ):
     pipeline = as_pipeline(
-        "WhitespaceTokenizer", "CountVectorsFeaturizer", "EmbeddingIntentClassifier"
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
     )
-    assert pipeline[2]["name"] == "EmbeddingIntentClassifier"
+    assert pipeline[2]["name"] == "DIETClassifier"
     pipeline[2].update(classifier_params)
 
     _config = RasaNLUModelConfig({"pipeline": pipeline})
@@ -212,9 +209,9 @@ async def test_margin_loss_is_not_normalized(
     monkeypatch, component_builder, tmpdir, classifier_params, output_length
 ):
     pipeline = as_pipeline(
-        "WhitespaceTokenizer", "CountVectorsFeaturizer", "EmbeddingIntentClassifier"
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
     )
-    assert pipeline[2]["name"] == "EmbeddingIntentClassifier"
+    assert pipeline[2]["name"] == "DIETClassifier"
     pipeline[2].update(classifier_params)
 
     mock = Mock()
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 04a889a6b2ae..6152b3a0eb63 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -44,7 +44,7 @@ def pipelines_for_tests():
                 "EntitySynonymMapper",
                 "SklearnIntentClassifier",
                 "MitieIntentClassifier",
-                "EmbeddingIntentClassifier",
+                "DIETClassifier",
                 "KeywordIntentClassifier",
                 "ResponseSelector",
             ),

From ff97270aababd98af2bd4f77cf20adef37b43a9d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 27 Jan 2020 17:47:46 +0100
Subject: [PATCH 218/633] renaming

---
 data/test_config/embedding_random_seed.yaml |  2 +-
 rasa/core/policies/TED_policy.py            |  4 +---
 rasa/nlu/classifiers/DIET_classifier.py     |  7 +------
 rasa/nlu/registry.py                        |  6 +++---
 rasa/utils/tensorflow/tf_models.py          | 10 +++++++---
 5 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/data/test_config/embedding_random_seed.yaml b/data/test_config/embedding_random_seed.yaml
index c2bd5bb86918..52748baddc77 100644
--- a/data/test_config/embedding_random_seed.yaml
+++ b/data/test_config/embedding_random_seed.yaml
@@ -1,3 +1,3 @@
 policies:
-- name: EmbeddingPolicy
+- name: TEDPolicy
   random_seed: 42
diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index 6654792dc3e8..5d83841ba36a 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -289,7 +289,6 @@ def train(
             self.config[EVAL_NUM_EXAMPLES],
             self.config[EVAL_NUM_EPOCHS],
             batch_strategy=self.config[BATCH_STRATEGY],
-            random_seed=self.config[RANDOM_SEED],
         )
 
     def continue_training(
@@ -316,7 +315,6 @@ def continue_training(
             self.config[EVAL_NUM_EXAMPLES],
             self.config[EVAL_NUM_EPOCHS],
             batch_strategy=self.config[BATCH_STRATEGY],
-            random_seed=self.config[RANDOM_SEED],
         )
 
     def predict_action_probabilities(
@@ -437,7 +435,7 @@ def __init__(
         max_history_tracker_featurizer_used: bool,
         label_data: RasaModelData,
     ):
-        super().__init__()
+        super().__init__(name="TED", random_seed=config[RANDOM_SEED])
 
         self.config = config
         self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index f272e05130c7..71bcbb1c0329 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -552,10 +552,6 @@ def train(
         # keep one example for persisting and loading
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
-        # TODO set it in the model
-        # set random seed
-        tf.random.set_seed(self.component_config[RANDOM_SEED])
-
         self.model = DIET(
             model_data.get_signature(),
             self._label_data,
@@ -570,7 +566,6 @@ def train(
             self.component_config[EVAL_NUM_EXAMPLES],
             self.component_config[EVAL_NUM_EPOCHS],
             batch_strategy=self.component_config[BATCH_STRATEGY],
-            random_seed=self.component_config[RANDOM_SEED],
         )
 
     # process helpers
@@ -825,7 +820,7 @@ def __init__(
         inverted_tag_dict: Dict[int, Text],
         config: Dict[Text, Any],
     ) -> None:
-        super().__init__(name="DIET")
+        super().__init__(name="DIET", random_seed=config[RANDOM_SEED])
 
         # data
         self.data_signature = data_signature
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 3a4cab41bb8c..9a8efdb0cd82 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -105,7 +105,7 @@
     "intent_classifier_sklearn": "SklearnIntentClassifier",
     "intent_classifier_mitie": "MitieIntentClassifier",
     "intent_classifier_keyword": "KeywordIntentClassifier",
-    "intent_classifier_tensorflow_embedding": "EmbeddingIntentClassifier",
+    "intent_classifier_tensorflow_embedding": "DIETClassifier",
 }
 
 # To simplify usage, there are a couple of model templates, that already add
@@ -134,12 +134,12 @@
             "min_ngram": 1,
             "max_ngram": 4,
         },
-        {"name": "EmbeddingIntentClassifier"},
+        {"name": "DIETClassifier"},
     ],
     "pretrained_embeddings_convert": [
         {"name": "ConveRTTokenizer"},
         {"name": "ConveRTFeaturizer"},
-        {"name": "EmbeddingIntentClassifier"},
+        {"name": "DIETClassifier"},
     ],
 }
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index d607cda75685..ba207fe9e23d 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -17,7 +17,7 @@ class RasaModel(tf.keras.models.Model):
     Cannot be used as tf.keras.Model
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, random_seed: Optional[int], *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
@@ -27,6 +27,11 @@ def __init__(self, *args, **kwargs):
 
         self._predict_function = None
 
+        self.random_seed = random_seed
+
+        tf.random.set_seed(random_seed)
+        np.random.seed(random_seed)
+
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
@@ -47,7 +52,6 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         eager: bool = False,
-        random_seed: Optional[int] = None,
     ) -> None:
         """Fit model data"""
 
@@ -62,7 +66,7 @@ def fit(
                 )
 
             model_data, evaluation_model_data = model_data.split(
-                evaluate_on_num_examples, random_seed
+                evaluate_on_num_examples, self.random_seed
             )
 
         (

From dfafa617a8219ef06dfcbe8ab72fa4b1f76a9fd9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 09:08:55 +0100
Subject: [PATCH 219/633] use defaultdict to get rid of pytype:key-error

---
 rasa/core/policies/TED_policy.py        |  9 +++---
 rasa/nlu/classifiers/DIET_classifier.py | 39 ++++++-------------------
 tests/core/test_policies.py             |  8 ++---
 3 files changed, 18 insertions(+), 38 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index 5d83841ba36a..bac63d9b0aa3 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import pickle
+from collections import defaultdict
 
 import numpy as np
 import tensorflow as tf
@@ -347,7 +348,7 @@ def persist(self, path: Text):
         if self.model is None:
             return
 
-        file_name = "embedding_policy"
+        file_name = "TED_policy"
         tf_model_file = os.path.join(path, f"{file_name}.tf_model")
 
         rasa.utils.io.create_directory_for_file(tf_model_file)
@@ -379,11 +380,11 @@ def load(cls, path: Text) -> "TEDPolicy":
 
         if not os.path.exists(path):
             raise Exception(
-                f"Failed to load embedding policy model. Path "
+                f"Failed to load TED policy model. Path "
                 f"'{os.path.abspath(path)}' doesn't exist."
             )
 
-        file_name = "embedding_policy"
+        file_name = "TED_policy"
         tf_model_file = os.path.join(path, f"{file_name}.tf_model")
 
         featurizer = TrackerFeaturizer.load(path)
@@ -462,7 +463,7 @@ def __init__(
         self.metrics_to_log += ["loss", "acc"]
 
         # set up tf layers
-        self._tf_layers = {}
+        self._tf_layers = defaultdict()
         self._prepare_layers()
 
     def _prepare_layers(self) -> None:
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 71bcbb1c0329..399a23209fa2 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -1,4 +1,6 @@
 import logging
+from collections import defaultdict
+
 import numpy as np
 import os
 import pickle
@@ -837,6 +839,7 @@ def __init__(
         self.config = config
 
         # tf objects
+        self._tf_layers = defaultdict()
         self._prepare_layers()
 
         # tf training
@@ -866,7 +869,6 @@ def _update_metrics_to_log(self) -> None:
             self.metrics_to_log += ["e_loss", "e_f1"]
 
     def _prepare_layers(self) -> None:
-        self._tf_layers = {}
         self._prepare_sequence_layers()
         self._prepare_mask_lm_layers()
         self._prepare_intent_classification_layers()
@@ -1012,15 +1014,10 @@ def _combine_sparse_dense_features(
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 if sparse_dropout:
-                    _f = self._tf_layers["sparse_dropout"](
-                        f, self._training
-                    )  # pytype: disable=key-error
+                    _f = self._tf_layers["sparse_dropout"](f, self._training)
                 else:
                     _f = f
-
-                dense_features.append(
-                    self._tf_layers[f"sparse_to_dense.{name}"](_f)
-                )  # pytype: disable=key-error
+                dense_features.append(self._tf_layers[f"sparse_to_dense.{name}"](_f))
             else:
                 dense_features.append(f)
 
@@ -1049,15 +1046,11 @@ def _create_sequence(
         )
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._tf_layers["input_mask"](
-                x, mask, self._training
-            )  # pytype: disable=key-error
+            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self._training)
         else:
             pre, lm_mask_bool = (x, None)
 
-        transformed = self._tf_layers["transformer"](
-            pre, 1 - mask, self._training
-        )  # pytype: disable=key-error
+        transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
         transformed = tf.nn.relu(transformed)
 
         return transformed, x, lm_mask_bool
@@ -1068,9 +1061,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
             self.tf_label_data["label_mask"][0],
             "label",
         )
-        all_labels_embed = self._tf_layers["embed.label"](
-            all_labels
-        )  # pytype: disable=key-error
+        all_labels_embed = self._tf_layers["embed.label"](all_labels)
 
         return all_labels, all_labels_embed
 
@@ -1094,26 +1085,22 @@ def _mask_loss(
         a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
         a_masked = tf.boolean_mask(a, lm_mask_bool)
 
-        # pytype: disable=key-error
         a_t_masked_embed = self._tf_layers["embed.lm_mask"](a_t_masked)
         a_masked_embed = self._tf_layers["embed.golden_token"](a_masked)
 
         return self._tf_layers["loss.mask"](
             a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
         )
-        # pytype: enable=key-error
 
     def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
         all_labels, all_labels_embed = self._create_all_labels()
 
-        # pytype: disable=key-error
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
 
         return self._tf_layers["loss.label"](
             a_embed, b_embed, b, all_labels_embed, all_labels
         )
-        # pytype: enable=key-error
 
     def _entity_loss(
         self, a: tf.Tensor, c: tf.Tensor, mask: tf.Tensor, sequence_lengths
@@ -1123,12 +1110,10 @@ def _entity_loss(
         sequence_lengths = sequence_lengths - 1
         c = tf.cast(c[:, :, 0], tf.int32)
 
-        # pytype: disable=key-error
         logits = self._tf_layers["embed.logits"](a)
 
         loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
-        # pytype: enable=key-error
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
@@ -1140,9 +1125,7 @@ def _entity_loss(
         c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
         pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
 
-        f1 = self._tf_layers["crf_f1_score"](
-            c_masked_1, pred_ids_masked_1
-        )  # pytype: disable=key-error
+        f1 = self._tf_layers["crf_f1_score"](c_masked_1, pred_ids_masked_1)
 
         return loss, f1
 
@@ -1211,7 +1194,6 @@ def batch_predict(
 
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
-            # pytype: disable=key-error
             cls_embed = self._tf_layers["embed.text"](cls)
 
             sim_all = self._tf_layers["loss.label"].sim(
@@ -1220,14 +1202,11 @@ def batch_predict(
             scores = self._tf_layers["loss.label"].confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
             )
-            # pytype: enable=key-error
             out["i_scores"] = scores
 
         if self.config[ENTITY_RECOGNITION]:
-            # pytype: disable=key-error
             logits = self._tf_layers["embed.logits"](text_transformed)
             pred_ids = self._tf_layers["crf"](logits, sequence_lengths - 1)
-            # pytype: enable=key-error
             out["e_ids"] = pred_ids
 
         return out
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 4e516c6bf14b..092f797c9966 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -32,7 +32,7 @@
 from rasa.core.policies.memoization import AugmentedMemoizationPolicy, MemoizationPolicy
 from rasa.core.policies.sklearn_policy import SklearnPolicy
 from rasa.core.trackers import DialogueStateTracker
-from rasa.utils.tensorflow.constants import SIMILARITY_TYPE
+from rasa.utils.tensorflow.constants import SIMILARITY_TYPE, RANKING_LENGTH
 from rasa.utils import train_utils
 from tests.core.conftest import (
     DEFAULT_DOMAIN_PATH_WITH_MAPPING,
@@ -341,7 +341,7 @@ def test_similarity_type(self, trained_policy):
         assert trained_policy.config[SIMILARITY_TYPE] == "inner"
 
     def test_ranking_length(self, trained_policy):
-        assert trained_policy.ranking_length == 10
+        assert trained_policy.config[RANKING_LENGTH] == 10
 
     def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch):
         # first check the output is what we expect
@@ -351,7 +351,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
         # count number of non-zero confidences
         assert (
             sum([confidence > 0 for confidence in predicted_probabilities])
-            == trained_policy.ranking_length
+            == trained_policy.config[RANKING_LENGTH]
         )
         # check that the norm is still 1
         assert sum(predicted_probabilities) == pytest.approx(1)
@@ -368,7 +368,7 @@ async def test_gen_batch(self, trained_policy, default_domain):
         training_data = trained_policy.featurize_for_training(
             training_trackers, default_domain
         )
-        model_data = trained_policy._create_modeldata(training_data.X, training_data.y)
+        model_data = trained_policy._create_model_data(training_data.X, training_data.y)
         batch_size = 2
         batch_x, batch_y, _ = next(
             model_data.gen_batch(batch_size=batch_size, label_key="label_ids")

From 57a9aa3558864a28376834d8b1b6b61afe3856c7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 10:09:48 +0100
Subject: [PATCH 220/633] defaultdict not working with TF

---
 rasa/core/policies/TED_policy.py        | 9 +++++++--
 rasa/nlu/classifiers/DIET_classifier.py | 9 +++++++--
 rasa/utils/tensorflow/tf_layers.py      | 3 ---
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index bac63d9b0aa3..a29c249da22b 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -2,7 +2,6 @@
 import logging
 import os
 import pickle
-from collections import defaultdict
 
 import numpy as np
 import tensorflow as tf
@@ -428,6 +427,9 @@ def load(cls, path: Text) -> "TEDPolicy":
         )
 
 
+# pytype: disable=key-error
+
+
 class TED(RasaModel):
     def __init__(
         self,
@@ -463,7 +465,7 @@ def __init__(
         self.metrics_to_log += ["loss", "acc"]
 
         # set up tf layers
-        self._tf_layers = defaultdict()
+        self._tf_layers = {}
         self._prepare_layers()
 
     def _prepare_layers(self) -> None:
@@ -592,3 +594,6 @@ def batch_predict(
         )
 
         return {"action_scores": scores}
+
+
+# pytype: enable=key-error
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 399a23209fa2..05261613ce62 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -1,5 +1,4 @@
 import logging
-from collections import defaultdict
 
 import numpy as np
 import os
@@ -814,6 +813,9 @@ def load(
         )
 
 
+# pytype: disable=key-error
+
+
 class DIET(RasaModel):
     def __init__(
         self,
@@ -839,7 +841,7 @@ def __init__(
         self.config = config
 
         # tf objects
-        self._tf_layers = defaultdict()
+        self._tf_layers = {}
         self._prepare_layers()
 
         # tf training
@@ -1210,3 +1212,6 @@ def batch_predict(
             out["e_ids"] = pred_ids
 
         return out
+
+
+# pytype: enable=key-error
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 671549a488db..8f0be36431a4 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -256,9 +256,6 @@ def __init__(
         ]
 
     def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
-
-        tf.print(training)
-
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
         attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
         attn_out = self._dropout(attn_out, training=training)

From f5a2d727eea7bcefdfe42c7237548d78bf13481f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 28 Jan 2020 10:24:34 +0100
Subject: [PATCH 221/633] remove print

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 ++
 rasa/utils/tensorflow/tf_layers.py                  | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fa9bb843ff4c..539c5a2bb0a1 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1059,6 +1059,8 @@ def _create_sequence(
         else:
             pre, lm_mask_bool = (x, None)
 
+        last = mask * tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
+        pre = tf.concat([pre, last], -1)
         transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
         transformed = tf.nn.relu(transformed)
 
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 671549a488db..7fb7bcc8d47d 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -257,8 +257,6 @@ def __init__(
 
     def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
-        tf.print(training)
-
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
         attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
         attn_out = self._dropout(attn_out, training=training)

From 6eacd5d4c494197644c238a18ad32df774b44568 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 28 Jan 2020 10:28:00 +0100
Subject: [PATCH 222/633] remove experimental lines

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 539c5a2bb0a1..fa9bb843ff4c 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1059,8 +1059,6 @@ def _create_sequence(
         else:
             pre, lm_mask_bool = (x, None)
 
-        last = mask * tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
-        pre = tf.concat([pre, last], -1)
         transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
         transformed = tf.nn.relu(transformed)
 

From f3f864457579704af5698e8d70e05862c0b248e2 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 10:41:32 +0100
Subject: [PATCH 223/633] update test_policies

---
 rasa/core/policies/TED_policy.py | 15 ++++---
 tests/core/test_policies.py      | 69 ++++++++++++++++----------------
 2 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index a29c249da22b..659cfdae14e3 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -92,7 +92,7 @@ class TEDPolicy(Policy):
         # number of epochs
         EPOCHS: 1,
         # set random seed to any int to get reproducible results
-        RANDOM_SEED: None,
+        RANDOM_SEED: 42,
         # embedding parameters
         # dimension size of embedding vectors
         EMBED_DIM: 20,
@@ -337,9 +337,10 @@ def predict_action_probabilities(
         confidence = confidence[0, -1, :]
 
         if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
+            print("DOING NORMALIZATION")
             confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH])
 
-        return list(confidence)
+        return confidence.tolist()
 
     def persist(self, path: Text):
         """Persists the policy to a storage."""
@@ -388,6 +389,9 @@ def load(cls, path: Text) -> "TEDPolicy":
 
         featurizer = TrackerFeaturizer.load(path)
 
+        if not os.path.exists(os.path.join(path, file_name + ".data_example.pkl")):
+            return cls(featurizer=featurizer)
+
         with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
             model_data_example = RasaModelData(
                 label_key="label_ids", data=pickle.load(f)
@@ -419,12 +423,7 @@ def load(cls, path: Text) -> "TEDPolicy":
         )
         model.build_for_predict(predict_data_example)
 
-        return cls(
-            featurizer=featurizer,
-            component_config=meta,
-            priority=meta["priority"],
-            model=model,
-        )
+        return cls(featurizer=featurizer, component_config=meta, model=model, **meta)
 
 
 # pytype: disable=key-error
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 092f797c9966..549b25d41e86 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -32,7 +32,14 @@
 from rasa.core.policies.memoization import AugmentedMemoizationPolicy, MemoizationPolicy
 from rasa.core.policies.sklearn_policy import SklearnPolicy
 from rasa.core.trackers import DialogueStateTracker
-from rasa.utils.tensorflow.constants import SIMILARITY_TYPE, RANKING_LENGTH
+from rasa.utils.tensorflow.constants import (
+    SIMILARITY_TYPE,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SCALE_LOSS,
+    EVAL_NUM_EXAMPLES,
+    EPOCHS,
+)
 from rasa.utils import train_utils
 from tests.core.conftest import (
     DEFAULT_DOMAIN_PATH_WITH_MAPPING,
@@ -138,7 +145,7 @@ def test_featurizer(self, trained_policy, tmpdir):
     async def test_continue_training(self, trained_policy, default_domain):
         training_trackers = await train_trackers(default_domain, augmentation_factor=0)
         trained_policy.continue_training(
-            training_trackers, default_domain, **{"epochs": 1}
+            training_trackers, default_domain, **{EPOCHS: 1}
         )
 
     async def test_persist_and_load(self, trained_policy, default_domain, tmpdir):
@@ -370,20 +377,15 @@ async def test_gen_batch(self, trained_policy, default_domain):
         )
         model_data = trained_policy._create_model_data(training_data.X, training_data.y)
         batch_size = 2
-        batch_x, batch_y, _ = next(
-            model_data.gen_batch(batch_size=batch_size, label_key="label_ids")
-        )
+        batch_x, batch_y, _ = next(model_data._gen_batch(batch_size=batch_size))
         assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size
         assert (
             batch_x[0].shape == model_data.get("dialogue_features")[0][0].shape
             and batch_y[0].shape == model_data.get("label_features")[0][0].shape
         )
         batch_x, batch_y, _ = next(
-            model_data.gen_batch(
-                batch_size=batch_size,
-                label_key="label_ids",
-                batch_strategy="balanced",
-                shuffle=True,
+            model_data._gen_batch(
+                batch_size=batch_size, batch_strategy="balanced", shuffle=True
             )
         )
         assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size
@@ -395,13 +397,11 @@ async def test_gen_batch(self, trained_policy, default_domain):
 
 class TestTEDPolicyMargin(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        p = TEDPolicy(
-            featurizer=featurizer, priority=priority, **{"loss_type": "margin"}
-        )
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{LOSS_TYPE: "margin"})
         return p
 
     def test_similarity_type(self, trained_policy):
-        assert trained_policy.similarity_type == "cosine"
+        assert trained_policy.config[SIMILARITY_TYPE] == "cosine"
 
     def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch):
         # Mock actual normalization method
@@ -418,18 +418,18 @@ def create_policy(self, featurizer, priority):
         p = TEDPolicy(
             featurizer=featurizer,
             priority=priority,
-            **{"scale_loss": False, "evaluate_on_num_examples": 4},
+            **{SCALE_LOSS: False, EVAL_NUM_EXAMPLES: 4},
         )
         return p
 
 
 class TestTEDPolicyNoNormalization(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 0})
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 0})
         return p
 
     def test_ranking_length(self, trained_policy):
-        assert trained_policy.ranking_length == 0
+        assert trained_policy.config[RANKING_LENGTH] == 0
 
     def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch):
         # first check the output is what we expect
@@ -449,22 +449,20 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc
 
 class TestTEDPolicyLowRankingLength(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        p = TEDPolicy(featurizer=featurizer, priority=priority, **{"ranking_length": 3})
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 3})
         return p
 
     def test_ranking_length(self, trained_policy):
-        assert trained_policy.ranking_length == 3
+        assert trained_policy.config[RANKING_LENGTH] == 3
 
 
 class TestTEDPolicyHighRankingLength(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
-        p = TEDPolicy(
-            featurizer=featurizer, priority=priority, **{"ranking_length": 11}
-        )
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 11})
         return p
 
     def test_ranking_length(self, trained_policy):
-        assert trained_policy.ranking_length == 11
+        assert trained_policy.config[RANKING_LENGTH] == 11
 
 
 class TestTEDPolicyWithFullDialogue(TestTEDPolicy):
@@ -513,18 +511,19 @@ def test_featurizer(self, trained_policy, tmpdir):
         )
 
 
-class TestTEDPolicyWithTfConfig(TestTEDPolicy):
-    def create_policy(self, featurizer, priority):
-        p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
-        return p
-
-    def test_tf_config(self, trained_policy, tmpdir):
-        # noinspection PyProtectedMember
-        assert trained_policy.session._config == session_config()
-        trained_policy.persist(tmpdir.strpath)
-        loaded = trained_policy.__class__.load(tmpdir.strpath)
-        # noinspection PyProtectedMember
-        assert loaded.session._config == session_config()
+# TODO test tf config
+# class TestTEDPolicyWithTfConfig(TestTEDPolicy):
+#     def create_policy(self, featurizer, priority):
+#         p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
+#         return p
+#
+#     def test_tf_config(self, trained_policy, tmpdir):
+#         # noinspection PyProtectedMember
+#         assert trained_policy.session._config == session_config()
+#         trained_policy.persist(tmpdir.strpath)
+#         loaded = trained_policy.__class__.load(tmpdir.strpath)
+#         # noinspection PyProtectedMember
+#         assert loaded.session._config == session_config()
 
 
 class TestMemoizationPolicy(PolicyTestCollection):

From 8c88d736e20f62bec79af39642395a971cb71e0f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 10:43:59 +0100
Subject: [PATCH 224/633] Update examples

---
 examples/formbot/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml
index 3aa0e7577759..f9eb4ff0ff7b 100644
--- a/examples/formbot/config.yml
+++ b/examples/formbot/config.yml
@@ -6,7 +6,7 @@ pipeline:
   - name: EntitySynonymMapper
   - name: CountVectorsFeaturizer
     token_pattern: (?u)\b\w+\b
-  - name: EmbeddingIntentClassifier
+  - name: DIETClassifier
   - name: DucklingHTTPExtractor
     url: http://localhost:8000
     dimensions:

From 1890a3e45c6f80e0549c0893134e63cabf5b2671 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 28 Jan 2020 10:47:56 +0100
Subject: [PATCH 225/633] remove unnecessary complexity from input mask, change
 defaults

---
 rasa/nlu/classifiers/embedding_intent_classifier.py |  4 ++--
 rasa/utils/tensorflow/tf_layers.py                  | 10 ++--------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fa9bb843ff4c..707b387d0c03 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -162,7 +162,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: True,
+        UNIDIRECTIONAL_ENCODER: False,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -174,7 +174,7 @@ class EmbeddingIntentClassifier(EntityExtractor):
         # if true named entity recognition is trained and entities predicted
         ENTITY_RECOGNITION: True,
         MASKED_LM: False,
-        SPARSE_INPUT_DROPOUT: False,
+        SPARSE_INPUT_DROPOUT: True,
     }
     # end default properties (DOC MARKER - don't remove)
 
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 7fb7bcc8d47d..04d880fb2e91 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -376,20 +376,14 @@ def call(
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
 
         def x_masked():
-            # do not substitute with cls token
-            pad_mask_up_to_last = tf.math.cumprod(
-                1 - mask, axis=1, exclusive=True, reverse=True
-            )
-            mask_up_to_last = 1 - pad_mask_up_to_last
-
             x_random_pad = (
                 tf.random.uniform(
                     tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype
                 )
-                * pad_mask_up_to_last
+                * (1 - mask)
             )
             # shuffle over batch dim
-            x_shuffle = tf.random.shuffle(x * mask_up_to_last + x_random_pad)
+            x_shuffle = tf.random.shuffle(x * mask + x_random_pad)
 
             # shuffle over sequence dim
             x_shuffle = tf.transpose(x_shuffle, [1, 0, 2])

From 22899806d0f5f47798aeb951c500fa5738f61793 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 11:06:02 +0100
Subject: [PATCH 226/633] update tests for DIET

---
 rasa/core/policies/TED_policy.py              |  1 -
 tests/nlu/classifiers/test_DIET_classifier.py | 46 ++++++++++++-------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index 659cfdae14e3..c719a5280cb7 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -337,7 +337,6 @@ def predict_action_probabilities(
         confidence = confidence[0, -1, :]
 
         if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
-            print("DOING NORMALIZATION")
             confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH])
 
         return confidence.tolist()
diff --git a/tests/nlu/classifiers/test_DIET_classifier.py b/tests/nlu/classifiers/test_DIET_classifier.py
index baae8f4dfa58..05b4b257ace1 100644
--- a/tests/nlu/classifiers/test_DIET_classifier.py
+++ b/tests/nlu/classifiers/test_DIET_classifier.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-import scipy.sparse
 
 from unittest.mock import Mock
 
@@ -13,6 +12,7 @@
     DENSE_FEATURE_NAMES,
     INTENT_ATTRIBUTE,
 )
+from rasa.utils.tensorflow.constants import LOSS_TYPE, RANDOM_SEED, RANKING_LENGTH
 from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
 from rasa.nlu.model import Interpreter
 from rasa.nlu.training_data import Message
@@ -80,29 +80,43 @@ def test_check_labels_features_exist(messages, expected):
     assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected
 
 
-async def test_train(component_builder, tmpdir):
-    pipeline = [
-        {"name": "ConveRTTokenizer"},
-        {"name": "CountVectorsFeaturizer"},
-        {"name": "ConveRTFeaturizer"},
-        {"name": "DIETClassifier"},
-    ]
+@pytest.mark.parametrize(
+    "pipeline",
+    [
+        [
+            {"name": "ConveRTTokenizer"},
+            {"name": "CountVectorsFeaturizer"},
+            {"name": "ConveRTFeaturizer"},
+            {"name": "DIETClassifier"},
+        ],
+        [
+            {"name": "WhitespaceTokenizer"},
+            {"name": "CountVectorsFeaturizer"},
+            {"name": "DIETClassifier", LOSS_TYPE: "margin"},
+        ],
+    ],
+)
+async def test_train_persist_load(pipeline, component_builder, tmpdir):
 
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})
 
-    (trained, _, persisted_path) = await train(
+    (trainer, trained, persisted_path) = await train(
         _config,
         path=tmpdir.strpath,
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
 
+    assert trainer.pipeline
     assert trained.pipeline
 
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
+    assert loaded.parse("hello") == trained.parse("hello")
+    assert loaded.parse("Hello today is Monday, again!") == trained.parse(
+        "Hello today is Monday, again!"
+    )
 
 
 async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
@@ -138,25 +152,25 @@ def as_pipeline(*components):
     [
         ({"random_seed": 42}, "data/test/many_intents.md", 10, True),  # default config
         (
-            {"random_seed": 42, "ranking_length": 0},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 0},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # no normalization
         (
-            {"random_seed": 42, "ranking_length": 3},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 3},
             "data/test/many_intents.md",
             3,
             True,
         ),  # lower than default ranking_length
         (
-            {"random_seed": 42, "ranking_length": 12},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 12},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # higher than default ranking_length
         (
-            {"random_seed": 42},
+            {RANDOM_SEED: 42},
             "examples/moodbot/data/nlu.md",
             7,
             True,
@@ -203,7 +217,7 @@ async def test_softmax_normalization(
 
 @pytest.mark.parametrize(
     "classifier_params, output_length",
-    [({"loss_type": "margin", "random_seed": 42}, LABEL_RANKING_LENGTH)],
+    [({LOSS_TYPE: "margin", RANDOM_SEED: 42}, LABEL_RANKING_LENGTH)],
 )
 async def test_margin_loss_is_not_normalized(
     monkeypatch, component_builder, tmpdir, classifier_params, output_length

From 414261005b88c1b1932195a1b91742cca791cd79 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 13:49:17 +0100
Subject: [PATCH 227/633] add test for DIET

---
 tests/nlu/classifiers/test_DIET_classifier.py | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tests/nlu/classifiers/test_DIET_classifier.py b/tests/nlu/classifiers/test_DIET_classifier.py
index 05b4b257ace1..c7986fea0dc7 100644
--- a/tests/nlu/classifiers/test_DIET_classifier.py
+++ b/tests/nlu/classifiers/test_DIET_classifier.py
@@ -119,6 +119,41 @@ async def test_train_persist_load(pipeline, component_builder, tmpdir):
     )
 
 
+@pytest.mark.parametrize(
+    "pipeline",
+    [
+        [
+            {"name": "WhitespaceTokenizer"},
+            {"name": "CountVectorsFeaturizer"},
+            {"name": "DIETClassifier", RANDOM_SEED: 42},
+        ]
+    ],
+)
+async def test_train_multiple_runs(pipeline, component_builder, tmpdir):
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})
+
+    trained_models = []
+
+    for _ in range(3):
+        (trainer, trained, persisted_path) = await train(
+            _config,
+            path=tmpdir.strpath,
+            data=DEFAULT_DATA_PATH,
+            component_builder=component_builder,
+        )
+
+        trained_models.append(trained)
+
+    result_1 = (trained_models[0]).parse("Hello again!")
+    result_2 = (trained_models[1]).parse("Hello again!")
+    result_3 = (trained_models[2]).parse("Hello again!")
+
+    assert result_1 == result_2
+    assert result_1 == result_3
+    assert result_2 == result_3
+
+
 async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
     from rasa.nlu import train
 

From 485d62e262baf7dc041cbaeb07d9d7f5fac49f17 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 14:05:05 +0100
Subject: [PATCH 228/633] Fix test_policies

---
 rasa/core/policies/TED_policy.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index c719a5280cb7..fe2d29b39bc5 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -157,16 +157,14 @@ def __init__(
 
         super().__init__(featurizer, priority)
 
-        self._load_params(**kwargs)
+        self._load_params(kwargs)
 
         self.model = model
 
         self._label_data = None
         self.data_example = None
 
-        self._tf_config = train_utils.load_tf_config(self.config)
-
-    def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
+    def _load_params(self, kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
 
@@ -356,10 +354,8 @@ def persist(self, path: Text):
 
         self.model.save(tf_model_file)
 
-        with open(os.path.join(path, file_name + ".tf_config.pkl"), "wb") as f:
-            pickle.dump(self._tf_config, f)
-
-        self.config["priority"] = self.priority
+        with open(os.path.join(path, file_name + ".priority.pkl"), "wb") as f:
+            pickle.dump(self.priority, f)
 
         with open(os.path.join(path, file_name + ".meta.pkl"), "wb") as f:
             pickle.dump(self.config, f)
@@ -402,6 +398,9 @@ def load(cls, path: Text) -> "TEDPolicy":
         with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
             meta = pickle.load(f)
 
+        with open(os.path.join(path, file_name + ".priority.pkl"), "rb") as f:
+            priority = pickle.load(f)
+
         meta = train_utils.update_auto_similarity_type(meta)
 
         model = TED.load(
@@ -422,7 +421,7 @@ def load(cls, path: Text) -> "TEDPolicy":
         )
         model.build_for_predict(predict_data_example)
 
-        return cls(featurizer=featurizer, component_config=meta, model=model, **meta)
+        return cls(featurizer=featurizer, priority=priority, model=model, **meta)
 
 
 # pytype: disable=key-error

From 9af27da5df327eaeb495489b2401f3c3c53c5152 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 14:58:41 +0100
Subject: [PATCH 229/633] remove duplicated test

---
 tests/nlu/classifiers/test_DIET_classifier.py | 36 +------------------
 tests/nlu/training/test_train.py              |  1 +
 2 files changed, 2 insertions(+), 35 deletions(-)

diff --git a/tests/nlu/classifiers/test_DIET_classifier.py b/tests/nlu/classifiers/test_DIET_classifier.py
index c7986fea0dc7..64c4a66f5972 100644
--- a/tests/nlu/classifiers/test_DIET_classifier.py
+++ b/tests/nlu/classifiers/test_DIET_classifier.py
@@ -17,6 +17,7 @@
 from rasa.nlu.model import Interpreter
 from rasa.nlu.training_data import Message
 from rasa.utils import train_utils
+from tests.nlu import utilities
 from tests.nlu.conftest import DEFAULT_DATA_PATH
 
 
@@ -119,41 +120,6 @@ async def test_train_persist_load(pipeline, component_builder, tmpdir):
     )
 
 
-@pytest.mark.parametrize(
-    "pipeline",
-    [
-        [
-            {"name": "WhitespaceTokenizer"},
-            {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETClassifier", RANDOM_SEED: 42},
-        ]
-    ],
-)
-async def test_train_multiple_runs(pipeline, component_builder, tmpdir):
-
-    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})
-
-    trained_models = []
-
-    for _ in range(3):
-        (trainer, trained, persisted_path) = await train(
-            _config,
-            path=tmpdir.strpath,
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder,
-        )
-
-        trained_models.append(trained)
-
-    result_1 = (trained_models[0]).parse("Hello again!")
-    result_2 = (trained_models[1]).parse("Hello again!")
-    result_3 = (trained_models[2]).parse("Hello again!")
-
-    assert result_1 == result_2
-    assert result_1 == result_3
-    assert result_2 == result_3
-
-
 async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
     from rasa.nlu import train
 
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 6152b3a0eb63..d88436564e2e 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -99,6 +99,7 @@ async def test_random_seed(component_builder, tmpdir):
     _config = utilities.base_test_conf("supervised_embeddings")
     # set fixed random seed of the embedding intent classifier to 1
     _config.set_component_attr(6, random_seed=1)
+
     # first run
     (trained_a, _, persisted_path_a) = await train(
         _config,

From e1975de0d8306fc5f281dd215b689785d3a69993 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 15:11:42 +0100
Subject: [PATCH 230/633] update headlines in docs

---
 docs/core/policies.rst  | 2 +-
 docs/nlu/components.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 853291a8b6ee..d4985c6a3e86 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -193,7 +193,7 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer.
 .. _embedding_policy:
 
 TED Policy
-^^^^^^^^^^^^^^^^
+^^^^^^^^^^
 
 Transformer Embedding Dialogue Policy (TEDP)
 
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 78fb8d13bb81..d3c82f8a18d4 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -453,7 +453,7 @@ SklearnIntentClassifier
           kernels: ["linear"]
 
 DIETClassifier
-~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~
 
 :Short: Embedding intent classifier
 :Outputs: ``intent`` and ``intent_ranking``

From db0eabf392bca02c593b1c94a0e9b77f0a581c9d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 28 Jan 2020 15:26:09 +0100
Subject: [PATCH 231/633] fix more tests.

---
 rasa/utils/tensorflow/tf_model_data.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index dc8412d30f17..e593b6694d3e 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -23,7 +23,10 @@ def __init__(self, label_key: Optional[Text] = None, data: Data = None):
         self.num_examples = self.get_number_of_examples()
 
     def get(self, key: Text) -> List[np.ndarray]:
-        return self.data[key]
+        if key in self.data:
+            return self.data[key]
+        else:
+            return []
 
     def items(self):
         return self.data.items()

From 218dc34685e2306a058ab59722b0f3ec69611d7c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 08:06:25 +0100
Subject: [PATCH 232/633] Update test configs.

---
 data/test/config_embedding_test.yml                         | 2 +-
 rasa/core/policies/TED_policy.py                            | 6 +++---
 .../config_embedding_intent_response_selector.yml           | 2 +-
 sample_configs/config_supervised_embeddings_duckling.yml    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/data/test/config_embedding_test.yml b/data/test/config_embedding_test.yml
index 379e1e2e3ef7..40f570d6627f 100644
--- a/data/test/config_embedding_test.yml
+++ b/data/test/config_embedding_test.yml
@@ -2,5 +2,5 @@ language: en
 pipeline:
 - name: "CountVectorsFeaturizer"
   max_ngram: 3
-- name: "EmbeddingIntentClassifier"
+- name: "DIETClassifier"
   epochs: 10
\ No newline at end of file
diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index fe2d29b39bc5..48655d42906c 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -148,7 +148,7 @@ def __init__(
         priority: int = DEFAULT_POLICY_PRIORITY,
         max_history: Optional[int] = None,
         model: Optional[RasaModel] = None,
-        **kwargs: Any,
+        **kwargs: Dict[Text, Any],
     ) -> None:
         """Declare instant variables with default values"""
 
@@ -157,14 +157,14 @@ def __init__(
 
         super().__init__(featurizer, priority)
 
-        self._load_params(kwargs)
+        self._load_params(**kwargs)
 
         self.model = model
 
         self._label_data = None
         self.data_example = None
 
-    def _load_params(self, kwargs: Dict[Text, Any]) -> None:
+    def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
 
diff --git a/sample_configs/config_embedding_intent_response_selector.yml b/sample_configs/config_embedding_intent_response_selector.yml
index 705cfca7ff51..b2be5582ade5 100644
--- a/sample_configs/config_embedding_intent_response_selector.yml
+++ b/sample_configs/config_embedding_intent_response_selector.yml
@@ -3,7 +3,7 @@ language: "en"
 pipeline:
   - name: "WhitespaceTokenizer"
   - name: "CountVectorsFeaturizer"
-  - name: "EmbeddingIntentClassifier"
+  - name: "DIETClassifier"
     epochs: 2
   - name: "ResponseSelector"
     epochs: 2
diff --git a/sample_configs/config_supervised_embeddings_duckling.yml b/sample_configs/config_supervised_embeddings_duckling.yml
index c1771ea5addc..8df9cb707711 100644
--- a/sample_configs/config_supervised_embeddings_duckling.yml
+++ b/sample_configs/config_supervised_embeddings_duckling.yml
@@ -2,6 +2,6 @@ language: "en"
 
 pipeline:
 - name: "CountVectorsFeaturizer"
-- name: "EmbeddingIntentClassifier"
+- name: "DIETClassifier"
 - name: "DucklingHTTPExtractor"
   url: "http://duckling:8000"

From 7cd41ef69a9b4f247faedf5d2be464e8684bd7af Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 12:02:36 +0100
Subject: [PATCH 233/633] fix model data tests

---
 rasa/nlu/classifiers/DIET_classifier.py |  8 ++++----
 tests/utils/test_tf_model_data.py       | 17 +++++++++++++----
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 8bd5cac59232..16417c905b35 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -792,10 +792,10 @@ def load(
         model = DIET.load(
             tf_model_file,
             model_data_example,
-            model_data_example.get_signature(),
-            label_data,
-            inv_tag_dict,
-            meta,
+            data_signature=model_data_example.get_signature(),
+            label_data=label_data,
+            inverted_tag_dict=inv_tag_dict,
+            config=meta,
         )
         # build the graph for prediction
         predict_data_example = RasaModelData(
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index ed12c623b0a3..17f58ed787dc 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -1,3 +1,5 @@
+import copy
+
 import pytest
 import scipy.sparse
 import numpy as np
@@ -58,14 +60,21 @@ async def model_data() -> RasaModelData:
 
 
 def test_shuffle_session_data(model_data: RasaModelData):
-    before = model_data.values()
+    before = copy.copy(model_data)
+
+    # precondition
+    assert np.all(
+        np.array(list(before.values())) == np.array(list(model_data.values()))
+    )
 
     data = model_data.shuffled_data(model_data.data)
 
     # check that original data didn't change
-    assert np.array(before) == np.array(model_data.values())
+    assert np.all(
+        np.array(list(before.values())) == np.array(list(model_data.values()))
+    )
     # check that new data is different
-    assert np.array(model_data.values()) != np.array(data.values())
+    assert np.all(np.array(model_data.values()) != np.array(data.values()))
 
 
 def test_split_session_data_by_label(model_data: RasaModelData):
@@ -120,7 +129,7 @@ def test_get_number_of_examples(model_data: RasaModelData):
 
 
 def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):
-    model_data.add_features("dense", [np.random.randint(5, size=(2, 10))])
+    model_data.data["dense"] = [np.random.randint(5, size=(2, 10))]
     with pytest.raises(ValueError):
         model_data.get_number_of_examples()
 

From edecd618c024a29e5750f3af8957bca103253e86 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 12:42:42 +0100
Subject: [PATCH 234/633] skip random seed tests for now

---
 tests/core/test_training.py      | 2 ++
 tests/nlu/training/test_train.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/core/test_training.py b/tests/core/test_training.py
index c2195f2991cd..2744dd5ac4b7 100644
--- a/tests/core/test_training.py
+++ b/tests/core/test_training.py
@@ -116,6 +116,8 @@ def configs_for_random_seed_test():
     ]
 
 
+# TODO fix random seed
+@pytest.mark.skip(reason="we need to fix the random seed first")
 @pytest.mark.parametrize("config_file", configs_for_random_seed_test())
 async def test_random_seed(tmpdir, config_file):
     # set random seed in config file to
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index d88436564e2e..5750d4940ba9 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -93,6 +93,8 @@ async def test_train_model(pipeline_template, component_builder, tmpdir):
     assert loaded.parse("Hello today is Monday, again!") is not None
 
 
+# TODO fix random seed
+@pytest.mark.skip(reason="we need to fix the random seed first")
 async def test_random_seed(component_builder, tmpdir):
     """test if train result is the same for two runs of tf embedding"""
 

From d58f36f7549c3509bae4a3d8e49ece6858e7eafc Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 13:44:43 +0100
Subject: [PATCH 235/633] support different pooling operations

---
 .../dense_featurizer/convert_featurizer.py    |  2 +-
 .../dense_featurizer/mitie_featurizer.py      |  7 +++--
 .../dense_featurizer/spacy_featurizer.py      | 31 +++++++++++++++++--
 .../sparse_featurizer/regex_featurizer.py     |  5 ++-
 .../nlu/featurizers/test_spacy_featurizer.py  | 23 ++++++++++++++
 5 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 16a260351bb6..969e58353447 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -163,7 +163,7 @@ def _sequence_encoding_of_text(self, batch: List[Text]) -> np.ndarray:
     def train(
         self,
         training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
+        config: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 8fd4ad68a9db..a1b4f8cafbe1 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -1,6 +1,6 @@
 import numpy as np
 import typing
-from typing import Any, List, Text
+from typing import Any, List, Text, Optional
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
@@ -39,7 +39,10 @@ def get_tokens_by_attribute(self, example: Message, attribute: Text) -> Any:
         return example.get(TOKENS_NAMES[attribute])
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 07ba118bd65f..f74926a0b754 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -1,6 +1,6 @@
 import numpy as np
 import typing
-from typing import Any, Optional, Text
+from typing import Any, Optional, Text, Dict
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
@@ -28,6 +28,17 @@ class SpacyFeaturizer(Featurizer):
         SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
+    defaults = {
+        # Specify what pooling operation should be used to calculate the vector of
+        # the CLS token. Available options: 'mean' and 'max'
+        "pooling": "mean"
+    }
+
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
+        super().__init__(component_config)
+
+        self.pooling_operation = self.component_config["pooling"]
+
     def _features_for_doc(self, doc: "Doc") -> np.ndarray:
         """Feature vector for a single document / sentence / tokens."""
         return np.array([t.vector for t in doc])
@@ -35,7 +46,7 @@ def _features_for_doc(self, doc: "Doc") -> np.ndarray:
     def train(
         self,
         training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
+        config: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
 
@@ -51,6 +62,20 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         self._set_spacy_features(message)
 
+    def _calculate_cls_vector(self, features: np.ndarray) -> np.ndarray:
+        # take only non zeros feature vectors into account
+        features = np.array([f for f in features if f.any()])
+
+        if self.pooling_operation == "mean":
+            return np.mean(features, axis=0, keepdims=True)
+        elif self.pooling_operation == "max":
+            return np.max(features, axis=0, keepdims=True)
+        else:
+            raise ValueError(
+                f"Invalid pooling operation specified. Available operations are "
+                f"'mean' or 'max', but provided value is '{self.pooling_operation}'."
+            )
+
     def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
         """Adds the spacy word vectors to the messages features."""
 
@@ -59,7 +84,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE
         if message_attribute_doc is not None:
             features = self._features_for_doc(message_attribute_doc)
 
-            cls_token_vec = np.mean(features, axis=0, keepdims=True)
+            cls_token_vec = self._calculate_cls_vector(features)
             features = np.concatenate([features, cls_token_vec])
 
             features = self._combine_with_existing_dense_features(
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index d9205cac056f..8f03d2d23765 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -48,7 +48,10 @@ def __init__(
         self._add_lookup_table_regexes(lookup_tables)
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         self.known_patterns = training_data.regex_features
diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py
index e13acd4a0312..95e7b9021865 100644
--- a/tests/nlu/featurizers/test_spacy_featurizer.py
+++ b/tests/nlu/featurizers/test_spacy_featurizer.py
@@ -171,3 +171,26 @@ def test_spacy_featurizer_train(spacy_nlp):
     vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
 
     assert vecs is None
+
+
+@pytest.mark.parametrize(
+    "pooling, features, expected",
+    [
+        (
+            "mean",
+            np.array([[0.5, 3, 0.4, 0.1], [0, 0, 0, 0], [0.5, 3, 0.4, 0.1]]),
+            np.array([[0.5, 3, 0.4, 0.1]]),
+        ),
+        (
+            "max",
+            np.array([[1.0, 3.0, 0.0, 2.0], [4.0, 3.0, 1.0, 0.0]]),
+            np.array([[4.0, 3.0, 1.0, 2.0]]),
+        ),
+    ],
+)
+def test_calculate_cls_vector(pooling, features, expected):
+    featurizer = SpacyFeaturizer.create({"pooling": pooling}, RasaNLUModelConfig())
+
+    actual = featurizer._calculate_cls_vector(features)
+
+    assert np.all(actual == expected)

From a8e843fefde39a9c894e6c35c1a8e53a0b852989 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 13:50:28 +0100
Subject: [PATCH 236/633] add changelog

---
 changelog/663.feature.rst | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 changelog/663.feature.rst

diff --git a/changelog/663.feature.rst b/changelog/663.feature.rst
new file mode 100644
index 000000000000..5a08ea5b69bd
--- /dev/null
+++ b/changelog/663.feature.rst
@@ -0,0 +1,5 @@
+The document vector of the ``SpacyFeaturizer`` can be calculated using max or mean pooling.
+
+To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` in your configuration file.
+The default pooling operation is ``mean``.
+The mean pooling operation also does not take words into account anymore, that do not have a word vector.
\ No newline at end of file

From c8dff671a5f7c87a8d76a1a5e741c9640293ae0a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 13:53:32 +0100
Subject: [PATCH 237/633] Update documentation

---
 docs/nlu/components.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index b88a44494354..c67803479c72 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -132,11 +132,17 @@ SpacyFeaturizer
 :Description:
     Creates feature for intent classification using the spacy featurizer.
 :Configuration:
+    The document vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
+    mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
+    The default pooling method is set to ``mean``.
 
     .. code-block:: yaml
 
         pipeline:
         - name: "SpacyFeaturizer"
+          # Specify what pooling operation should be used to calculate the vector of
+          # the CLS token. Available options: 'mean' and 'max'
+          "pooling": "mean"
 
 
 ConveRTFeaturizer

From cffc4af584e7b77d0a3c4f1ec4e3c46249e49e4e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 15:41:07 +0100
Subject: [PATCH 238/633] Add bilou utils.

---
 rasa/nlu/constants.py                         |   1 +
 rasa/nlu/extractors/crf_entity_extractor.py   |  72 ++-------
 rasa/nlu/extractors/entity_synonyms.py        |   5 +-
 rasa/nlu/extractors/mitie_entity_extractor.py |   5 +-
 rasa/nlu/utils/bilou_utils.py                 | 133 ++++++++++++++++
 ...ractor.py => test_crf_entity_extractor.py} |   9 +-
 tests/nlu/utils/__init__.py                   |   0
 tests/nlu/utils/test_bilou_utils.py           | 144 ++++++++++++++++++
 8 files changed, 306 insertions(+), 63 deletions(-)
 create mode 100644 rasa/nlu/utils/bilou_utils.py
 rename tests/nlu/extractors/{text_crf_entity_extractor.py => test_crf_entity_extractor.py} (96%)
 create mode 100644 tests/nlu/utils/__init__.py
 create mode 100644 tests/nlu/utils/test_bilou_utils.py

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 17139a7ba2ef..534ba66d87b6 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -7,6 +7,7 @@
 RESPONSE_ATTRIBUTE = "response"
 
 ENTITIES_ATTRIBUTE = "entities"
+BILOU_ENTITIES_ATTRIBUTE = "bilou_entities"
 
 EXTRACTOR_ATTRIBUTE = "extractor"
 
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index d291ffa18f35..7c752c20dfe5 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -5,6 +5,7 @@
 import numpy as np
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
 
+import rasa.nlu.utils.bilou_utils as bilou_utils
 from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
@@ -143,7 +144,10 @@ def required_packages(cls) -> List[Text]:
         return ["sklearn_crfsuite", "sklearn"]
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         # checks whether there is at least one
@@ -167,7 +171,7 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
         dataset = []
 
         for example in examples:
-            entity_offsets = self._convert_example(example)
+            entity_offsets = bilou_utils.map_message_entities(example)
             dataset.append(self._from_json_to_crf(example, entity_offsets))
 
         return dataset
@@ -194,13 +198,6 @@ def process(self, message: Message, **kwargs: Any) -> None:
             add_to_output=True,
         )
 
-    @staticmethod
-    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
-        def convert_entity(entity):
-            return entity["start"], entity["end"], entity["entity"]
-
-        return [convert_entity(ent) for ent in example.get(ENTITIES_ATTRIBUTE, [])]
-
     def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
         """Take a sentence and return entities in json format"""
 
@@ -265,16 +262,6 @@ def _create_entity_dict(
             "confidence": confidence,
         }
 
-    @staticmethod
-    def _entity_from_label(label) -> Text:
-        return label[2:]
-
-    @staticmethod
-    def _bilou_from_label(label) -> Optional[Text]:
-        if len(label) >= 2 and label[1] == "-":
-            return label[0].upper()
-        return None
-
     @staticmethod
     def _tokens_without_cls(message: Message) -> List[Token]:
         # [:-1] to remove the CLS token from the list of tokens
@@ -286,7 +273,7 @@ def _find_bilou_end(self, word_idx, entities) -> Any:
 
         # get information about the first word, tagged with `B-...`
         label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = self._entity_from_label(label)
+        entity_label = bilou_utils.entity_name_from_tag(label)
 
         while not finished:
             label, label_confidence = self.most_likely_entity(ent_word_idx, entities)
@@ -323,12 +310,12 @@ def _handle_bilou_label(
         self, word_idx: int, entities: List[Any]
     ) -> Tuple[Any, Any, Any]:
         label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = self._entity_from_label(label)
+        entity_label = bilou_utils.entity_name_from_tag(label)
 
-        if self._bilou_from_label(label) == "U":
+        if bilou_utils.bilou_from_tag(label) == "U":
             return word_idx, confidence, entity_label
 
-        elif self._bilou_from_label(label) == "B":
+        elif bilou_utils.bilou_from_tag(label) == "B":
             # start of multi word-entity need to represent whole extent
             ent_word_idx, confidence = self._find_bilou_end(word_idx, entities)
             return ent_word_idx, confidence, entity_label
@@ -509,7 +496,7 @@ def _from_json_to_crf(
             ents = [l[5] for l in gold.orig_annot]
         else:
             doc_or_tokens = self._tokens_without_cls(message)
-            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)
+            ents = bilou_utils.bilou_tags_from_offsets(doc_or_tokens, entity_offsets)
 
         # collect badly annotated examples
         collected = []
@@ -531,45 +518,12 @@ def _from_json_to_crf(
 
         if not self.component_config["BILOU_flag"]:
             for i, label in enumerate(ents):
-                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
+                if bilou_utils.bilou_from_tag(label) in {"B", "I", "U", "L"}:
                     # removes BILOU prefix from label
-                    ents[i] = self._entity_from_label(label)
+                    ents[i] = bilou_utils.entity_name_from_tag(label)
 
         return self._from_text_to_crf(message, ents)
 
-    @staticmethod
-    def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text]:
-        # From spacy.spacy.GoldParse, under MIT License
-        starts = {token.start: i for i, token in enumerate(tokens)}
-        ends = {token.end: i for i, token in enumerate(tokens)}
-        bilou = ["-" for _ in tokens]
-        # Handle entity cases
-        for start_char, end_char, label in entities:
-            start_token = starts.get(start_char)
-            end_token = ends.get(end_char)
-            # Only interested if the tokenization is correct
-            if start_token is not None and end_token is not None:
-                if start_token == end_token:
-                    bilou[start_token] = "U-%s" % label
-                else:
-                    bilou[start_token] = "B-%s" % label
-                    for i in range(start_token + 1, end_token):
-                        bilou[i] = "I-%s" % label
-                    bilou[end_token] = "L-%s" % label
-        # Now distinguish the O cases from ones where we miss the tokenization
-        entity_chars = set()
-        for start_char, end_char, label in entities:
-            for i in range(start_char, end_char):
-                entity_chars.add(i)
-        for n, token in enumerate(tokens):
-            for i in range(token.start, token.end):
-                if i in entity_chars:
-                    break
-            else:
-                bilou[n] = missing
-
-        return bilou
-
     @staticmethod
     def __pattern_of_token(message: Message, i: int) -> Dict:
         if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None:
diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py
index 500027711621..f46096891817 100644
--- a/rasa/nlu/extractors/entity_synonyms.py
+++ b/rasa/nlu/extractors/entity_synonyms.py
@@ -26,7 +26,10 @@ def __init__(
         self.synonyms = synonyms if synonyms else {}
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         for key, value in list(training_data.entity_synonyms.items()):
diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py
index 7bd00b0fb799..e0d5a271eb3f 100644
--- a/rasa/nlu/extractors/mitie_entity_extractor.py
+++ b/rasa/nlu/extractors/mitie_entity_extractor.py
@@ -63,7 +63,10 @@ def extract_entities(
         return ents
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
         import mitie
 
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
new file mode 100644
index 000000000000..eb6823045d72
--- /dev/null
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -0,0 +1,133 @@
+from typing import List, Tuple, Text, Optional, Dict
+
+from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.training_data import Message
+from rasa.nlu.training_data import TrainingData
+from rasa.nlu.constants import (
+    ENTITIES_ATTRIBUTE,
+    TOKENS_NAMES,
+    TEXT_ATTRIBUTE,
+    BILOU_ENTITIES_ATTRIBUTE,
+)
+
+BILOU_PREFIXES = ["B-", "I-", "U-", "L-"]
+
+
+def entity_name_from_tag(tag: Text) -> Text:
+    """Remove the BILOU prefix from the given tag."""
+    if tag[:2] in BILOU_PREFIXES:
+        return tag[2:]
+    return tag
+
+
+def bilou_from_tag(tag: Text) -> Optional[Text]:
+    """Get the BILOU prefix (without -) from the given tag."""
+    if len(tag) >= 2 and tag[1] == "-" and tag[:2] in BILOU_PREFIXES:
+        return tag[0].upper()
+    return None
+
+
+def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
+    """Maps the entity tags of the message to the ids of the provided dict."""
+    if message.get(BILOU_ENTITIES_ATTRIBUTE):
+        _tags = [
+            tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict["O"]
+            for _tag in message.get(BILOU_ENTITIES_ATTRIBUTE)
+        ]
+    else:
+        _tags = [tag_id_dict["O"] for _ in message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])]
+
+    return _tags
+
+
+def remove_bilou_prefixes(tags: List[Text]) -> List[Text]:
+    """Remove the BILOU prefixes from the given tags."""
+    return [entity_name_from_tag(t) for t in tags]
+
+
+def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
+    """Create a mapping of unique tags to ids."""
+    distinct_tag_ids = set(
+        [
+            entity_name_from_tag(e)
+            for example in training_data.training_examples
+            if example.get(BILOU_ENTITIES_ATTRIBUTE)
+            for e in example.get(BILOU_ENTITIES_ATTRIBUTE)
+        ]
+    ) - {"O"}
+
+    tag_id_dict = {
+        f"{prefix}{tag_id}": idx_1 * len(BILOU_PREFIXES) + idx_2 + 1
+        for idx_1, tag_id in enumerate(sorted(distinct_tag_ids))
+        for idx_2, prefix in enumerate(BILOU_PREFIXES)
+    }
+    tag_id_dict["O"] = 0
+
+    return tag_id_dict
+
+
+def apply_bilou_schema(training_data: TrainingData):
+    """Obtains a list of BILOU entity tags and sets them on the corresponding
+    message."""
+    for message in training_data.training_examples:
+        entities = message.get(ENTITIES_ATTRIBUTE)
+
+        if not entities:
+            continue
+
+        entities = map_message_entities(message)
+        output = bilou_tags_from_offsets(
+            message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), entities
+        )
+
+        message.set(BILOU_ENTITIES_ATTRIBUTE, output)
+
+
+def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
+    """Maps the entities of the given message to their start, end, and tag values."""
+
+    def convert_entity(entity):
+        return entity["start"], entity["end"], entity["entity"]
+
+    return [convert_entity(entity) for entity in message.get(ENTITIES_ATTRIBUTE, [])]
+
+
+def bilou_tags_from_offsets(
+    tokens: List[Token], entities: List[Tuple[int, int, Text]], missing: Text = "O"
+) -> List[Text]:
+    """Creates a list of BILOU tags for the given list of tokens and entities."""
+
+    # From spacy.spacy.GoldParse, under MIT License
+    starts = {token.start: i for i, token in enumerate(tokens)}
+    ends = {token.end: i for i, token in enumerate(tokens)}
+    bilou = ["-" for _ in tokens]
+
+    # Handle entity cases
+    for start_char, end_char, label in entities:
+        start_token = starts.get(start_char)
+        end_token = ends.get(end_char)
+
+        # Only interested if the tokenization is correct
+        if start_token is not None and end_token is not None:
+            if start_token == end_token:
+                bilou[start_token] = "U-%s" % label
+            else:
+                bilou[start_token] = "B-%s" % label
+                for i in range(start_token + 1, end_token):
+                    bilou[i] = "I-%s" % label
+                bilou[end_token] = "L-%s" % label
+
+    # Now distinguish the O cases from ones where we miss the tokenization
+    entity_chars = set()
+    for start_char, end_char, label in entities:
+        for i in range(start_char, end_char):
+            entity_chars.add(i)
+
+    for n, token in enumerate(tokens):
+        for i in range(token.start, token.end):
+            if i in entity_chars:
+                break
+        else:
+            bilou[n] = missing
+
+    return bilou
diff --git a/tests/nlu/extractors/text_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
similarity index 96%
rename from tests/nlu/extractors/text_crf_entity_extractor.py
rename to tests/nlu/extractors/test_crf_entity_extractor.py
index 1ff19ba338de..8c832894fd25 100644
--- a/tests/nlu/extractors/text_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -1,3 +1,4 @@
+from rasa.nlu.constants import TEXT_ATTRIBUTE
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import TrainingData, Message
 
@@ -172,6 +173,7 @@ def test_crf_create_entity_dict(spacy_nlp):
                             },
                         }
                     ],
+                    "spacy_doc": spacy_nlp("where is St. Michael's Hospital?"),
                 },
             )
         },
@@ -196,14 +198,17 @@ def test_crf_create_entity_dict(spacy_nlp):
                             },
                         }
                     ],
+                    "spacy_doc": spacy_nlp("where is Children's Hospital?"),
                 },
             )
         },
     ]
     for ex in examples:
         # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text
-        spacy_tokens = spacy_tokenizer.tokenize(spacy_nlp(ex["message"].text))
-        white_space_tokens = white_space_tokenizer.tokenize(ex["message"].text)
+        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT_ATTRIBUTE)
+        white_space_tokens = white_space_tokenizer.tokenize(
+            ex["message"], TEXT_ATTRIBUTE
+        )
         for tokenizer, tokens in [
             ("SpacyTokenizer", spacy_tokens),
             ("WhitespaceTokenizer", white_space_tokens),
diff --git a/tests/nlu/utils/__init__.py b/tests/nlu/utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
new file mode 100644
index 000000000000..f1ff8a11fdef
--- /dev/null
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -0,0 +1,144 @@
+import pytest
+
+import rasa.nlu.utils.bilou_utils as bilou_utils
+from nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
+from nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from nlu.training_data import TrainingData
+from rasa.nlu.training_data import Message
+
+
+@pytest.mark.parametrize(
+    "tag, expected",
+    [
+        ("B-person", "person"),
+        ("I-location", "location"),
+        ("location", "location"),
+        ("U-company", "company"),
+        ("L-company", "company"),
+    ],
+)
+def test_entity_name_from_tag(tag, expected):
+    actual = bilou_utils.entity_name_from_tag(tag)
+
+    assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "tag, expected",
+    [
+        ("B-person", "B"),
+        ("I-location", "I"),
+        ("location", None),
+        ("U-company", "U"),
+        ("L-company", "L"),
+        ("O-company", None),
+    ],
+)
+def test_bilou_from_tag(tag, expected):
+    actual = bilou_utils.bilou_from_tag(tag)
+
+    assert actual == expected
+
+
+def test_tags_to_ids():
+    message = Message("Germany is part of the European Union")
+    message.set(
+        BILOU_ENTITIES_ATTRIBUTE,
+        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
+    )
+
+    tag_id_dict = {"O": 0, "U-location": 1, "B-organisation": 2, "L-organisation": 3}
+
+    tags = bilou_utils.tags_to_ids(message, tag_id_dict)
+
+    assert tags == [1, 0, 0, 0, 0, 2, 3]
+
+
+def test_remove_bilou_prefixes():
+    actual = bilou_utils.remove_bilou_prefixes(
+        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"]
+    )
+
+    assert actual == ["location", "O", "O", "O", "O", "organisation", "organisation"]
+
+
+def test_build_tag_id_dict():
+    message_1 = Message("Germany is part of the European Union")
+    message_1.set(
+        BILOU_ENTITIES_ATTRIBUTE,
+        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
+    )
+
+    message_2 = Message("Berlin is the capital of Germany")
+    message_2.set(
+        BILOU_ENTITIES_ATTRIBUTE, ["U-location", "O", "O", "O", "O", "U-location"]
+    )
+
+    training_data = TrainingData([message_1, message_2])
+
+    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)
+
+    assert tag_id_dict == {
+        "O": 0,
+        "B-location": 1,
+        "I-location": 2,
+        "U-location": 3,
+        "L-location": 4,
+        "B-organisation": 5,
+        "I-organisation": 6,
+        "U-organisation": 7,
+        "L-organisation": 8,
+    }
+
+
+def test_apply_bilou_schema():
+    tokenizer = WhitespaceTokenizer()
+
+    message_1 = Message("Germany is part of the European Union")
+    message_1.set(
+        ENTITIES_ATTRIBUTE,
+        [
+            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
+            {
+                "start": 23,
+                "end": 37,
+                "value": "European Union",
+                "entity": "organisation",
+            },
+        ],
+    )
+
+    message_2 = Message("Berlin is the capital of Germany")
+    message_2.set(
+        ENTITIES_ATTRIBUTE,
+        [
+            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
+            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
+        ],
+    )
+
+    training_data = TrainingData([message_1, message_2])
+
+    tokenizer.train(training_data)
+
+    bilou_utils.apply_bilou_schema(training_data)
+
+    assert message_1.get(BILOU_ENTITIES_ATTRIBUTE) == [
+        "U-location",
+        "O",
+        "O",
+        "O",
+        "O",
+        "B-organisation",
+        "L-organisation",
+        "O",
+    ]
+    assert message_2.get(BILOU_ENTITIES_ATTRIBUTE) == [
+        "U-location",
+        "O",
+        "O",
+        "O",
+        "O",
+        "U-location",
+        "O",
+    ]

From 4af49a7f6382673e233792b734fa1d2f02e842ac Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 16:28:25 +0100
Subject: [PATCH 239/633] Add BILOU option to EmbeddingIntentClassifier.

---
 .../embedding_intent_classifier.py            | 31 ++++++--
 rasa/nlu/extractors/crf_entity_extractor.py   |  6 +-
 rasa/nlu/utils/bilou_utils.py                 | 71 ++++++++++++-------
 rasa/utils/tensorflow/constants.py            |  2 +
 tests/nlu/utils/test_bilou_utils.py           |  2 +-
 5 files changed, 78 insertions(+), 34 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 707b387d0c03..baa5ea8b2a5a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -10,6 +10,7 @@
 from typing import Any, Dict, List, Optional, Text, Tuple, Union
 
 import rasa.utils.io
+import rasa.nlu.utils.bilou_utils as bilou_utils
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -65,6 +66,7 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
+    BILOU_FLAG,
 )
 
 logger = logging.getLogger(__name__)
@@ -175,6 +177,8 @@ class EmbeddingIntentClassifier(EntityExtractor):
         ENTITY_RECOGNITION: True,
         MASKED_LM: False,
         SPARSE_INPUT_DROPOUT: True,
+        # if true BILOU schema is used for entities
+        BILOU_FLAG: False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -249,10 +253,12 @@ def _create_label_id_dict(
             label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids))
         }
 
-    @staticmethod
-    def _create_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
+    def _create_tag_id_dict(self, training_data: TrainingData) -> Dict[Text, int]:
         """Create label_id dictionary"""
 
+        if self.component_config[BILOU_FLAG]:
+            return bilou_utils.build_tag_id_dict(training_data)
+
         distinct_tag_ids = set(
             [
                 e["entity"]
@@ -454,10 +460,15 @@ def _create_model_data(
                     label_ids.append(label_id_dict[e.get(label_attribute)])
 
             if self.component_config[ENTITY_RECOGNITION] and tag_id_dict:
-                _tags = []
-                for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
-                    _tag = determine_token_labels(t, e.get(ENTITIES_ATTRIBUTE), None)
-                    _tags.append(tag_id_dict[_tag])
+                if self.component_config[BILOU_FLAG]:
+                    _tags = bilou_utils.tags_to_ids(e, tag_id_dict)
+                else:
+                    _tags = []
+                    for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
+                        _tag = determine_token_labels(
+                            t, e.get(ENTITIES_ATTRIBUTE), None
+                        )
+                        _tags.append(tag_id_dict[_tag])
                 # transpose to have seq_len x 1
                 tag_ids.append(np.array([_tags]).T)
 
@@ -493,6 +504,9 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
 
         Performs sanity checks on training data, extracts encodings for labels.
         """
+        if self.component_config[BILOU_FLAG]:
+            bilou_utils.apply_bilou_schema(training_data)
+
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=INTENT_ATTRIBUTE
         )
@@ -655,6 +669,11 @@ def _predict_entities(
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
 
+        print(tags)
+
+        if self.component_config[BILOU_FLAG]:
+            tags = bilou_utils.remove_bilou_prefixes(tags)
+
         entities = self._convert_tags_to_entities(
             message.text, message.get("tokens", []), tags
         )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 7c752c20dfe5..c154f34de7dd 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -312,10 +312,10 @@ def _handle_bilou_label(
         label, confidence = self.most_likely_entity(word_idx, entities)
         entity_label = bilou_utils.entity_name_from_tag(label)
 
-        if bilou_utils.bilou_from_tag(label) == "U":
+        if bilou_utils.bilou_prefix_from_tag(label) == "U":
             return word_idx, confidence, entity_label
 
-        elif bilou_utils.bilou_from_tag(label) == "B":
+        elif bilou_utils.bilou_prefix_from_tag(label) == "B":
             # start of multi word-entity need to represent whole extent
             ent_word_idx, confidence = self._find_bilou_end(word_idx, entities)
             return ent_word_idx, confidence, entity_label
@@ -518,7 +518,7 @@ def _from_json_to_crf(
 
         if not self.component_config["BILOU_flag"]:
             for i, label in enumerate(ents):
-                if bilou_utils.bilou_from_tag(label) in {"B", "I", "U", "L"}:
+                if bilou_utils.bilou_prefix_from_tag(label) in {"B", "I", "U", "L"}:
                     # removes BILOU prefix from label
                     ents[i] = bilou_utils.entity_name_from_tag(label)
 
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index eb6823045d72..019117c06a93 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Text, Optional, Dict
+from typing import List, Tuple, Text, Optional, Dict, Set
 
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message
@@ -20,7 +20,7 @@ def entity_name_from_tag(tag: Text) -> Text:
     return tag
 
 
-def bilou_from_tag(tag: Text) -> Optional[Text]:
+def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
     """Get the BILOU prefix (without -) from the given tag."""
     if len(tag) >= 2 and tag[1] == "-" and tag[:2] in BILOU_PREFIXES:
         return tag[0].upper()
@@ -47,7 +47,7 @@ def remove_bilou_prefixes(tags: List[Text]) -> List[Text]:
 
 def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
     """Create a mapping of unique tags to ids."""
-    distinct_tag_ids = set(
+    distinct_tags = set(
         [
             entity_name_from_tag(e)
             for example in training_data.training_examples
@@ -57,8 +57,8 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
     ) - {"O"}
 
     tag_id_dict = {
-        f"{prefix}{tag_id}": idx_1 * len(BILOU_PREFIXES) + idx_2 + 1
-        for idx_1, tag_id in enumerate(sorted(distinct_tag_ids))
+        f"{prefix}{tag}": idx_1 * len(BILOU_PREFIXES) + idx_2 + 1
+        for idx_1, tag in enumerate(sorted(distinct_tags))
         for idx_2, prefix in enumerate(BILOU_PREFIXES)
     }
     tag_id_dict["O"] = 0
@@ -98,36 +98,59 @@ def bilou_tags_from_offsets(
     """Creates a list of BILOU tags for the given list of tokens and entities."""
 
     # From spacy.spacy.GoldParse, under MIT License
-    starts = {token.start: i for i, token in enumerate(tokens)}
-    ends = {token.end: i for i, token in enumerate(tokens)}
+
+    start_pos_to_token_idx = {token.start: i for i, token in enumerate(tokens)}
+    end_pos_to_token_idx = {token.end: i for i, token in enumerate(tokens)}
+
     bilou = ["-" for _ in tokens]
 
     # Handle entity cases
-    for start_char, end_char, label in entities:
-        start_token = starts.get(start_char)
-        end_token = ends.get(end_char)
+    _handle_entities(bilou, entities, end_pos_to_token_idx, start_pos_to_token_idx)
+
+    # Now distinguish the O cases from ones where we miss the tokenization
+    entity_positions = _get_entity_positions(entities)
+    _handle_not_an_entity(bilou, tokens, entity_positions, missing)
+
+    return bilou
+
+
+def _handle_entities(
+    bilou: List[Text],
+    entities: List[Tuple[int, int, Text]],
+    end_pos_to_token_idx: Dict[int, int],
+    start_pos_to_token_idx: Dict[int, int],
+):
+    for start_pos, end_pos, label in entities:
+        start_token_idx = start_pos_to_token_idx.get(start_pos)
+        end_token_idx = end_pos_to_token_idx.get(end_pos)
 
         # Only interested if the tokenization is correct
-        if start_token is not None and end_token is not None:
-            if start_token == end_token:
-                bilou[start_token] = "U-%s" % label
+        if start_token_idx is not None and end_token_idx is not None:
+            if start_token_idx == end_token_idx:
+                bilou[start_token_idx] = "U-%s" % label
             else:
-                bilou[start_token] = "B-%s" % label
-                for i in range(start_token + 1, end_token):
+                bilou[start_token_idx] = "B-%s" % label
+                for i in range(start_token_idx + 1, end_token_idx):
                     bilou[i] = "I-%s" % label
-                bilou[end_token] = "L-%s" % label
+                bilou[end_token_idx] = "L-%s" % label
 
-    # Now distinguish the O cases from ones where we miss the tokenization
-    entity_chars = set()
-    for start_char, end_char, label in entities:
-        for i in range(start_char, end_char):
-            entity_chars.add(i)
 
+def _get_entity_positions(entities: List[Tuple[int, int, Text]]) -> Set[int]:
+    entity_positions = set()
+
+    for start_pos, end_pos, label in entities:
+        for i in range(start_pos, end_pos):
+            entity_positions.add(i)
+
+    return entity_positions
+
+
+def _handle_not_an_entity(
+    bilou: List[Text], tokens: List[Token], entity_positions: Set[int], missing: Text
+):
     for n, token in enumerate(tokens):
         for i in range(token.start, token.end):
-            if i in entity_chars:
+            if i in entity_positions:
                 break
         else:
             bilou[n] = missing
-
-    return bilou
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index a47b9885aaf9..b7b3c68c2a5a 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -46,3 +46,5 @@
 SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
 
 RANKING_LENGTH = "ranking_length"
+
+BILOU_FLAG = "BILOU_flag"
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index f1ff8a11fdef..bc2e9c0b9fc8 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -35,7 +35,7 @@ def test_entity_name_from_tag(tag, expected):
     ],
 )
 def test_bilou_from_tag(tag, expected):
-    actual = bilou_utils.bilou_from_tag(tag)
+    actual = bilou_utils.bilou_prefix_from_tag(tag)
 
     assert actual == expected
 

From c11a077ebaef57044062f25efb7ff806c41146ca Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 29 Jan 2020 17:33:04 +0100
Subject: [PATCH 240/633] fix more tests

---
 rasa/nlu/classifiers/DIET_classifier.py       | 19 ++++++++++++-------
 .../selectors/embedding_response_selector.py  |  2 +-
 tests/nlu/base/test_evaluation.py             |  9 ++++++---
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 16417c905b35..abe56bc43cf2 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -439,11 +439,16 @@ def _create_model_data(
         tag_ids = []
 
         for e in training_data:
-            _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE)
-            if _sparse is not None:
-                X_sparse.append(_sparse)
-            if _dense is not None:
-                X_dense.append(_dense)
+            if (
+                label_attribute is None
+                or label_attribute == INTENT_ATTRIBUTE
+                or e.get(label_attribute)
+            ):
+                _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE)
+                if _sparse is not None:
+                    X_sparse.append(_sparse)
+                if _dense is not None:
+                    X_dense.append(_dense)
 
             if e.get(label_attribute):
                 _sparse, _dense = self._extract_and_add_features(e, label_attribute)
@@ -821,7 +826,7 @@ def __init__(
         self,
         data_signature: Dict[Text, List[FeatureSignature]],
         label_data: RasaModelData,
-        inverted_tag_dict: Dict[int, Text],
+        inverted_tag_dict: Optional[Dict[int, Text]],
         config: Dict[Text, Any],
     ) -> None:
         super().__init__(name="DIET", random_seed=config[RANDOM_SEED])
@@ -836,7 +841,7 @@ def __init__(
         self.tf_label_data = self.batch_to_model_data_format(
             label_batch, label_data.get_signature()
         )
-        self._num_tags = len(inverted_tag_dict)
+        self._num_tags = len(inverted_tag_dict) if inverted_tag_dict is not None else 0
 
         self.config = config
 
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 3f3356354ca6..4c76ed6bfb49 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -208,8 +208,8 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         label_id_dict = self._create_label_id_dict(
             training_data, attribute=RESPONSE_ATTRIBUTE
         )
-
         self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
+
         self._label_data = self._create_label_data(
             training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE
         )
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index d0fdddc15194..2c5a76a8c217 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -329,9 +329,12 @@ def test_run_cv_evaluation_with_response_selector():
     assert len(response_selection_results.test["Accuracy"]) == n_folds
     assert len(response_selection_results.test["Precision"]) == n_folds
     assert len(response_selection_results.test["F1-score"]) == n_folds
-    # No entity extractor in pipeline
-    assert len(entity_results.train) == 0
-    assert len(entity_results.test) == 0
+    assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds
+    assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds
+    assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds
 
 
 def test_response_selector_present():

From 0ee3cf441ef8437b03bb855de729aaff20b7d654 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:11:03 +0100
Subject: [PATCH 241/633] Fix random seed.

---
 rasa/core/policies/TED_policy.py        | 4 +++-
 rasa/nlu/classifiers/DIET_classifier.py | 4 ++++
 rasa/utils/tensorflow/tf_layers.py      | 9 +++++++--
 rasa/utils/tensorflow/tf_models.py      | 7 ++++---
 requirements.txt                        | 3 ---
 tests/core/test_processor.py            | 2 +-
 tests/core/test_training.py             | 3 +--
 tests/nlu/training/test_train.py        | 2 --
 8 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index 26160751014a..fd02d006c6a2 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -92,7 +92,7 @@ class TEDPolicy(Policy):
         # number of epochs
         EPOCHS: 1,
         # set random seed to any int to get reproducible results
-        RANDOM_SEED: 42,
+        RANDOM_SEED: None,
         # embedding parameters
         # dimension size of embedding vectors
         EMBED_DIM: 20,
@@ -479,6 +479,8 @@ def _prepare_layers(self) -> None:
             self.config[USE_MAX_SIM_NEG],
             self.config[C_EMB],
             self.config[SCALE_LOSS],
+            # set to 1 to get deterministic behaviour
+            parallel_iterations=1 if self.random_seed is not None else 1000,
         )
         self._tf_layers["ffnn.dialogue"] = tf_layers.ReluFfn(
             self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 37a8ade901a2..bf5126d46c06 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -970,6 +970,8 @@ def _prepare_mask_lm_layers(self) -> None:
             self.config[USE_MAX_SIM_NEG],
             self.config[C_EMB],
             self.config[SCALE_LOSS],
+            # set to 1 to get deterministic behaviour
+            parallel_iterations=1 if self.random_seed is not None else 1000,
         )
 
     def _prepare_intent_classification_layers(self) -> None:
@@ -993,6 +995,8 @@ def _prepare_intent_classification_layers(self) -> None:
             self.config[USE_MAX_SIM_NEG],
             self.config[C_EMB],
             self.config[SCALE_LOSS],
+            # set to 1 to get deterministic behaviour
+            parallel_iterations=1 if self.random_seed is not None else 1000,
         )
 
     def _prepare_entity_recognition_layers(self) -> None:
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 9f43ac166f57..e3e2ea42bbd3 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -452,6 +452,8 @@ def __init__(
         neg_lambda: float,
         scale_loss: bool,
         name: Text = None,
+        parallel_iterations: int = 1000,
+        same_sampling: bool = False,
     ) -> None:
         super().__init__(name=name)
         self.num_neg = num_neg
@@ -461,6 +463,8 @@ def __init__(
         self.use_max_sim_neg = use_max_sim_neg
         self.neg_lambda = neg_lambda
         self.scale_loss = scale_loss
+        self.parallel_iterations = parallel_iterations
+        self.same_sampling = same_sampling
 
     @staticmethod
     def _make_flat(x: tf.Tensor) -> tf.Tensor:
@@ -476,7 +480,8 @@ def rand_idxs():
                 tf.random.shuffle(tf.range(total_candidates))[: self.num_neg], 0
             )
 
-        # return tf.tile(rand_idxs(), (batch_size, 1))
+        if self.same_sampling:
+            return tf.tile(rand_idxs(), (batch_size, 1))
 
         def cond(i, out):
             """Condition for while loop"""
@@ -501,7 +506,7 @@ def body(i, out):
             body,
             loop_vars=[i1, out1],
             shape_invariants=[i1.shape, tf.TensorShape([None, self.num_neg])],
-            parallel_iterations=1000,
+            parallel_iterations=self.parallel_iterations,
             back_prop=False,
         )[1]
 
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index ba207fe9e23d..3761c660e086 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -29,9 +29,6 @@ def __init__(self, random_seed: Optional[int], *args, **kwargs):
 
         self.random_seed = random_seed
 
-        tf.random.set_seed(random_seed)
-        np.random.seed(random_seed)
-
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
@@ -55,6 +52,9 @@ def fit(
     ) -> None:
         """Fit model data"""
 
+        tf.random.set_seed(self.random_seed)
+        np.random.seed(self.random_seed)
+
         disable = silent or is_logging_disabled()
 
         evaluation_model_data = None
@@ -173,6 +173,7 @@ def load(
         )
         # load trained weights
         model.load_weights(model_file_name)
+
         logger.debug("Finished loading the model.")
         return model
 
diff --git a/requirements.txt b/requirements.txt
index 528e7141635a..f58d5afa0ddb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -58,9 +58,6 @@ sklearn-crfsuite==0.3.6
 psycopg2-binary==2.8.2
 PyJWT==1.7.1
 python-dateutil==2.8.0
-# remove when tensorflow@1.15.x or a pre-release patch is released
-# https://github.com/tensorflow/tensorflow/issues/32319
-gast==0.2.2
 # for new featurizers
 tensorflow==2.1.0
 tensorflow_hub==0.7.0
diff --git a/tests/core/test_processor.py b/tests/core/test_processor.py
index 6641e948d25c..d0e9bf16faf8 100644
--- a/tests/core/test_processor.py
+++ b/tests/core/test_processor.py
@@ -424,7 +424,7 @@ async def test_reminder_restart(
         # last user event is way in the past
         (UserUttered(timestamp=1), 60, True),
         # user event are very recent
-        (UserUttered("hello", timestamp=time.time()), 60, False),
+        (UserUttered("hello", timestamp=time.time()), 120, False),
         # there is user event
         (ActionExecuted(ACTION_LISTEN_NAME, timestamp=time.time()), 60, False),
         # Old event, but sessions are disabled
diff --git a/tests/core/test_training.py b/tests/core/test_training.py
index 2744dd5ac4b7..c61847392073 100644
--- a/tests/core/test_training.py
+++ b/tests/core/test_training.py
@@ -116,12 +116,11 @@ def configs_for_random_seed_test():
     ]
 
 
-# TODO fix random seed
-@pytest.mark.skip(reason="we need to fix the random seed first")
 @pytest.mark.parametrize("config_file", configs_for_random_seed_test())
 async def test_random_seed(tmpdir, config_file):
     # set random seed in config file to
     # generate a reproducible training result
+
     agent_1 = await train(
         DEFAULT_DOMAIN_PATH_WITH_SLOTS,
         DEFAULT_STORIES_FILE,
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 5750d4940ba9..d88436564e2e 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -93,8 +93,6 @@ async def test_train_model(pipeline_template, component_builder, tmpdir):
     assert loaded.parse("Hello today is Monday, again!") is not None
 
 
-# TODO fix random seed
-@pytest.mark.skip(reason="we need to fix the random seed first")
 async def test_random_seed(component_builder, tmpdir):
     """test if train result is the same for two runs of tf embedding"""
 

From ba4099632c36be39db38e7cbc2f4799598568aea Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:16:59 +0100
Subject: [PATCH 242/633] Skip tf config test.

---
 tests/core/test_policies.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 549b25d41e86..e9a3bda513c0 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -214,14 +214,14 @@ def create_policy(self, featurizer, priority):
         p = KerasPolicy(featurizer, priority, **tf_defaults())
         return p
 
-    # TODO test tf config
-    # def test_tf_config(self, trained_policy, tmpdir):
-    #     # noinspection PyProtectedMember
-    #     assert trained_policy.session._config == session_config()
-    #     trained_policy.persist(tmpdir.strpath)
-    #     loaded = trained_policy.__class__.load(tmpdir.strpath)
-    #     # noinspection PyProtectedMember
-    #     assert loaded.session._config == session_config()
+    @pytest.mark.skip(reason="We need to fix tf.config!")
+    def test_tf_config(self, trained_policy, tmpdir):
+        # noinspection PyProtectedMember
+        assert trained_policy.session._config == session_config()
+        trained_policy.persist(tmpdir.strpath)
+        loaded = trained_policy.__class__.load(tmpdir.strpath)
+        # noinspection PyProtectedMember
+        assert loaded.session._config == session_config()
 
 
 class TestSklearnPolicy(PolicyTestCollection):

From bfa4ca4ea80a7e831986ff5d8dc65129594ef3ea Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 30 Jan 2020 13:27:30 +0100
Subject: [PATCH 243/633] use gelu, use sparse weights in dense layers

---
 rasa/core/policies/embedding_policy.py        |  6 ++-
 .../embedding_intent_classifier.py            |  8 +--
 rasa/utils/tensorflow/tf_layers.py            | 53 +++++++++++++------
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 2a1764d2de40..fa8eff39a6b6 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import tensorflow as tf
+import tensorflow_addons as tfa
 
 from typing import Any, List, Optional, Text, Dict, Tuple, Union
 
@@ -477,13 +478,13 @@ def _prepare_layers(self) -> None:
             self.config[C_EMB],
             self.config[SCALE_LOSS],
         )
-        self._tf_layers["ffnn.dialogue"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.dialogue"] = tf_layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
             self.config[C2],
             layer_name_suffix="dialogue",
         )
-        self._tf_layers["ffnn.label"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE_LABEL],
             self.config[C2],
@@ -530,6 +531,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
         dialogue_transformed = self._tf_layers["transformer"](
             dialogue, 1 - tf.expand_dims(mask, axis=-1), self._training
         )
+        dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         if self.max_history_tracker_featurizer_used:
             # pick last label if max history featurizer is used
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 707b387d0c03..ef1838bfe1df 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -920,13 +920,13 @@ def _prepare_sequence_layers(self) -> None:
             self.config[C2],
             self.config[DENSE_DIM]["label"],
         )
-        self._tf_layers["ffnn.text"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.text"] = tf_layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_TEXT],
             self.config[DROPRATE],
             self.config[C2],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
         )
-        self._tf_layers["ffnn.label"] = tf_layers.ReluFfn(
+        self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE],
             self.config[C2],
@@ -1060,7 +1060,7 @@ def _create_sequence(
             pre, lm_mask_bool = (x, None)
 
         transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
-        transformed = tf.nn.relu(transformed)
+        transformed = tfa.activations.gelu(transformed)
 
         return transformed, x, lm_mask_bool
 
@@ -1138,7 +1138,7 @@ def _entity_loss(
         return loss, f1
 
     def batch_loss(
-        self, batch_in: Union[List[tf.Tensor], List[np.ndarray]]
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 04d880fb2e91..841ae452efd6 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -26,10 +26,30 @@ def dropped_inputs():
         return outputs
 
 
+class DenseWithSparseWeights(tf.keras.layers.Dense):
+    def __init__(self, sparsity: int = 0.8, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.sparsity = sparsity
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        super().build(input_shape)
+        # create random mask to set weights to 0
+        kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
+        kernel_mask = tf.cast(
+            tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
+        )
+        self.kernel_mask = tf.Variable(initial_value=kernel_mask, trainable=False)
+
+    def call(self, inputs: tf.Tensor) -> tf.Tensor:
+        if self.sparsity:
+            # set some weights to 0
+            self.kernel.assign(self.kernel * self.kernel_mask)
+        return super().call(inputs)
+
+
 class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor"""
 
-    # noinspection PyPep8Naming
     def __init__(self, reg_lambda: float, **kwargs) -> None:
         l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
 
@@ -57,7 +77,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         return outputs
 
 
-class ReluFfn(tf.keras.layers.Layer):
+class Ffnn(tf.keras.layers.Layer):
     """Create feed-forward network with hidden layers and name suffix."""
 
     def __init__(
@@ -73,9 +93,9 @@ def __init__(
         self._ffn_layers = []
         for i, layer_size in enumerate(layer_sizes):
             self._ffn_layers.append(
-                tf.keras.layers.Dense(
+                DenseWithSparseWeights(
                     units=layer_size,
-                    activation="relu",
+                    activation=tfa.activations.gelu,
                     kernel_regularizer=l2_regularizer,
                     name=f"hidden_layer_{layer_name_suffix}_{i}",
                 )
@@ -178,10 +198,10 @@ def __init__(self, d_model: int, num_heads: int) -> None:
 
         self._depth = d_model // self.num_heads
 
-        self._wq = tf.keras.layers.Dense(d_model, use_bias=False)
-        self._wk = tf.keras.layers.Dense(d_model, use_bias=False)
-        self._wv = tf.keras.layers.Dense(d_model, use_bias=False)
-        self._dense = tf.keras.layers.Dense(d_model)
+        self._wq = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._dense = DenseWithSparseWeights(units=d_model)
 
     def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
         """Split the last dimension into (num_heads, depth).
@@ -249,9 +269,11 @@ def __init__(
 
         self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
-            tf.keras.layers.Dense(dff, activation="relu"),  # (batch_size, seq_len, dff)
+            DenseWithSparseWeights(
+                units=dff, activation=tfa.activations.gelu
+            ),  # (batch_size, seq_len, dff)
             tf.keras.layers.Dropout(rate),
-            tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
+            DenseWithSparseWeights(units=d_model),  # (batch_size, seq_len, d_model)
             tf.keras.layers.Dropout(rate),
         ]
 
@@ -317,7 +339,7 @@ def __init__(
         self.unidirectional = unidirectional
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._embedding = tf.keras.layers.Dense(
+        self._embedding = DenseWithSparseWeights(
             units=d_model, kernel_regularizer=l2_regularizer
         )
 
@@ -376,12 +398,9 @@ def call(
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
 
         def x_masked():
-            x_random_pad = (
-                tf.random.uniform(
-                    tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype
-                )
-                * (1 - mask)
-            )
+            x_random_pad = tf.random.uniform(
+                tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype
+            ) * (1 - mask)
             # shuffle over batch dim
             x_shuffle = tf.random.shuffle(x * mask + x_random_pad)
 

From c752174e3e06cde6c5964880e58fe57b6582292a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 30 Jan 2020 13:33:45 +0100
Subject: [PATCH 244/633] reorder

---
 rasa/utils/tensorflow/tf_layers.py | 42 +++++++++++++++---------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 841ae452efd6..b69be6f06927 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -26,27 +26,6 @@ def dropped_inputs():
         return outputs
 
 
-class DenseWithSparseWeights(tf.keras.layers.Dense):
-    def __init__(self, sparsity: int = 0.8, **kwargs) -> None:
-        super().__init__(**kwargs)
-        self.sparsity = sparsity
-
-    def build(self, input_shape: tf.TensorShape) -> None:
-        super().build(input_shape)
-        # create random mask to set weights to 0
-        kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
-        kernel_mask = tf.cast(
-            tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
-        )
-        self.kernel_mask = tf.Variable(initial_value=kernel_mask, trainable=False)
-
-    def call(self, inputs: tf.Tensor) -> tf.Tensor:
-        if self.sparsity:
-            # set some weights to 0
-            self.kernel.assign(self.kernel * self.kernel_mask)
-        return super().call(inputs)
-
-
 class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor"""
 
@@ -77,6 +56,27 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         return outputs
 
 
+class DenseWithSparseWeights(tf.keras.layers.Dense):
+    def __init__(self, sparsity: int = 0.8, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.sparsity = sparsity
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        super().build(input_shape)
+        # create random mask to set weights to 0
+        kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
+        kernel_mask = tf.cast(
+            tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
+        )
+        self.kernel_mask = tf.Variable(initial_value=kernel_mask, trainable=False)
+
+    def call(self, inputs: tf.Tensor) -> tf.Tensor:
+        if self.sparsity:
+            # set some weights to 0
+            self.kernel.assign(self.kernel * self.kernel_mask)
+        return super().call(inputs)
+
+
 class Ffnn(tf.keras.layers.Layer):
     """Create feed-forward network with hidden layers and name suffix."""
 

From 10cda06d10dcee4aa7661d7dba40c3031d41d441 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 30 Jan 2020 13:35:29 +0100
Subject: [PATCH 245/633] remove if

---
 rasa/utils/tensorflow/tf_layers.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index b69be6f06927..a6480462f784 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -71,9 +71,8 @@ def build(self, input_shape: tf.TensorShape) -> None:
         self.kernel_mask = tf.Variable(initial_value=kernel_mask, trainable=False)
 
     def call(self, inputs: tf.Tensor) -> tf.Tensor:
-        if self.sparsity:
-            # set some weights to 0
-            self.kernel.assign(self.kernel * self.kernel_mask)
+        # set some weights to 0 according to precomputed mask
+        self.kernel.assign(self.kernel * self.kernel_mask)
         return super().call(inputs)
 
 

From e6a1f8ba2e451918954291a83e7964c7142dbad1 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:49:28 +0100
Subject: [PATCH 246/633] Update changelog/663.feature.rst

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 changelog/663.feature.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changelog/663.feature.rst b/changelog/663.feature.rst
index 5a08ea5b69bd..e97c764566bc 100644
--- a/changelog/663.feature.rst
+++ b/changelog/663.feature.rst
@@ -2,4 +2,4 @@ The document vector of the ``SpacyFeaturizer`` can be calculated using max or me
 
 To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` in your configuration file.
 The default pooling operation is ``mean``.
-The mean pooling operation also does not take words into account anymore, that do not have a word vector.
\ No newline at end of file
+The mean pooling operation also does not take into account words, that do not have a word vector.

From 46a36b10ea1d1e0e4a7d809f3587c05a9888747d Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:49:42 +0100
Subject: [PATCH 247/633] Update docs/nlu/components.rst

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 docs/nlu/components.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 14ee9f951265..e1245e145325 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -132,7 +132,7 @@ SpacyFeaturizer
 :Description:
     Creates feature for intent classification using the spacy featurizer.
 :Configuration:
-    The document vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
+    The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
     The default pooling method is set to ``mean``.
 

From a341d6cd54009b48adabad2d8e2288679105bf3b Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:49:54 +0100
Subject: [PATCH 248/633] Update
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index f74926a0b754..b8efbb83dda3 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -64,7 +64,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
     def _calculate_cls_vector(self, features: np.ndarray) -> np.ndarray:
         # take only non zeros feature vectors into account
-        features = np.array([f for f in features if f.any()])
+        non_zero_features = np.array([f for f in features if f.any()])
 
         if self.pooling_operation == "mean":
             return np.mean(features, axis=0, keepdims=True)

From 75e04b5ec3cb1ac6925152f491db074a391a9378 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:50:03 +0100
Subject: [PATCH 249/633] Update
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index b8efbb83dda3..7985741fbe4d 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -67,7 +67,7 @@ def _calculate_cls_vector(self, features: np.ndarray) -> np.ndarray:
         non_zero_features = np.array([f for f in features if f.any()])
 
         if self.pooling_operation == "mean":
-            return np.mean(features, axis=0, keepdims=True)
+            return np.mean(non_zero_features, axis=0, keepdims=True)
         elif self.pooling_operation == "max":
             return np.max(features, axis=0, keepdims=True)
         else:

From ba4b8f70ffbb7bb4d3429c3b6aaa1f9fbcc3f632 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:50:09 +0100
Subject: [PATCH 250/633] Update
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 7985741fbe4d..a1c48b38bcd9 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -69,7 +69,7 @@ def _calculate_cls_vector(self, features: np.ndarray) -> np.ndarray:
         if self.pooling_operation == "mean":
             return np.mean(non_zero_features, axis=0, keepdims=True)
         elif self.pooling_operation == "max":
-            return np.max(features, axis=0, keepdims=True)
+            return np.max(non_zero_features, axis=0, keepdims=True)
         else:
             raise ValueError(
                 f"Invalid pooling operation specified. Available operations are "

From 8ea98245b1e86fb39410a179f832c75cdfd69ced Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 13:56:47 +0100
Subject: [PATCH 251/633] pooling operation for mitie featurizer.

---
 changelog/663.feature.rst                     |  8 +++----
 docs/nlu/components.rst                       |  7 +++++-
 .../dense_featurizer/mitie_featurizer.py      | 15 ++++++++++--
 .../dense_featurizer/spacy_featurizer.py      | 16 +------------
 rasa/nlu/featurizers/featurizer.py            | 16 +++++++++++++
 tests/nlu/featurizers/test_featurizer.py      | 23 +++++++++++++++++++
 .../nlu/featurizers/test_spacy_featurizer.py  | 23 -------------------
 7 files changed, 63 insertions(+), 45 deletions(-)

diff --git a/changelog/663.feature.rst b/changelog/663.feature.rst
index e97c764566bc..064d2e25398f 100644
--- a/changelog/663.feature.rst
+++ b/changelog/663.feature.rst
@@ -1,5 +1,5 @@
-The document vector of the ``SpacyFeaturizer`` can be calculated using max or mean pooling.
+The sentence vector of the ``SpacyFeaturizer`` and ``MitieFeaturizer`` can be calculated using max or mean pooling.
 
-To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` in your configuration file.
-The default pooling operation is ``mean``.
-The mean pooling operation also does not take into account words, that do not have a word vector.
+To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` or the ``MitieFeaturizer``
+in your configuration file. The default pooling operation is ``mean``. The mean pooling operation also does not take
+into account words, that do not have a word vector.
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index e1245e145325..b19599451927 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -114,12 +114,17 @@ MitieFeaturizer
         to use precomputed features.
 
 :Configuration:
+    The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
+    mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
+    The default pooling method is set to ``mean``.
 
     .. code-block:: yaml
 
         pipeline:
         - name: "MitieFeaturizer"
-
+          # Specify what pooling operation should be used to calculate the vector of
+          # the CLS token. Available options: 'mean' and 'max'
+          "pooling": "mean"
 
 
 SpacyFeaturizer
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index a1b4f8cafbe1..6aeccac04625 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -1,6 +1,6 @@
 import numpy as np
 import typing
-from typing import Any, List, Text, Optional
+from typing import Any, List, Text, Optional, Dict
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
@@ -27,6 +27,17 @@ class MitieFeaturizer(Featurizer):
         "mitie_feature_extractor"
     ]
 
+    defaults = {
+        # Specify what pooling operation should be used to calculate the vector of
+        # the CLS token. Available options: 'mean' and 'max'
+        "pooling": "mean"
+    }
+
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
+        super().__init__(component_config)
+
+        self.pooling_operation = self.component_config["pooling"]
+
     @classmethod
     def required_packages(cls) -> List[Text]:
         return ["mitie", "numpy"]
@@ -107,7 +118,7 @@ def features_for_tokens(
             features.append(feature_extractor.get_feature_vector(token.text))
         features = np.array(features)
 
-        cls_token_vec = np.mean(features, axis=0, keepdims=True)
+        cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation)
         features = np.concatenate([features, cls_token_vec])
 
         return features
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index a1c48b38bcd9..53acff7937a0 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -62,20 +62,6 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         self._set_spacy_features(message)
 
-    def _calculate_cls_vector(self, features: np.ndarray) -> np.ndarray:
-        # take only non zeros feature vectors into account
-        non_zero_features = np.array([f for f in features if f.any()])
-
-        if self.pooling_operation == "mean":
-            return np.mean(non_zero_features, axis=0, keepdims=True)
-        elif self.pooling_operation == "max":
-            return np.max(non_zero_features, axis=0, keepdims=True)
-        else:
-            raise ValueError(
-                f"Invalid pooling operation specified. Available operations are "
-                f"'mean' or 'max', but provided value is '{self.pooling_operation}'."
-            )
-
     def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
         """Adds the spacy word vectors to the messages features."""
 
@@ -84,7 +70,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE
         if message_attribute_doc is not None:
             features = self._features_for_doc(message_attribute_doc)
 
-            cls_token_vec = self._calculate_cls_vector(features)
+            cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation)
             features = np.concatenate([features, cls_token_vec])
 
             features = self._combine_with_existing_dense_features(
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index 63f28465dc58..aa9e47a331a5 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -68,3 +68,19 @@ def _combine_with_existing_sparse_features(
             return hstack([message.get(feature_name), additional_features])
         else:
             return additional_features
+
+    def _calculate_cls_vector(
+        self, features: np.ndarray, pooling_operation: Text
+    ) -> np.ndarray:
+        # take only non zeros feature vectors into account
+        non_zero_features = np.array([f for f in features if f.any()])
+
+        if pooling_operation == "mean":
+            return np.mean(non_zero_features, axis=0, keepdims=True)
+        elif pooling_operation == "max":
+            return np.max(non_zero_features, axis=0, keepdims=True)
+        else:
+            raise ValueError(
+                f"Invalid pooling operation specified. Available operations are "
+                f"'mean' or 'max', but provided value is '{pooling_operation}'."
+            )
diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py
index 666dcdd4ba98..04989097aa9f 100644
--- a/tests/nlu/featurizers/test_featurizer.py
+++ b/tests/nlu/featurizers/test_featurizer.py
@@ -101,3 +101,26 @@ def test_sequence_to_sentence_features(features, expected):
         assert np.all(expected.toarray() == actual.toarray())
     else:
         assert np.all(expected == actual)
+
+
+@pytest.mark.parametrize(
+    "pooling, features, expected",
+    [
+        (
+            "mean",
+            np.array([[0.5, 3, 0.4, 0.1], [0, 0, 0, 0], [0.5, 3, 0.4, 0.1]]),
+            np.array([[0.5, 3, 0.4, 0.1]]),
+        ),
+        (
+            "max",
+            np.array([[1.0, 3.0, 0.0, 2.0], [4.0, 3.0, 1.0, 0.0]]),
+            np.array([[4.0, 3.0, 1.0, 2.0]]),
+        ),
+    ],
+)
+def test_calculate_cls_vector(pooling, features, expected):
+    featurizer = Featurizer()
+
+    actual = featurizer._calculate_cls_vector(features, pooling)
+
+    assert np.all(actual == expected)
diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py
index 95e7b9021865..e13acd4a0312 100644
--- a/tests/nlu/featurizers/test_spacy_featurizer.py
+++ b/tests/nlu/featurizers/test_spacy_featurizer.py
@@ -171,26 +171,3 @@ def test_spacy_featurizer_train(spacy_nlp):
     vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
 
     assert vecs is None
-
-
-@pytest.mark.parametrize(
-    "pooling, features, expected",
-    [
-        (
-            "mean",
-            np.array([[0.5, 3, 0.4, 0.1], [0, 0, 0, 0], [0.5, 3, 0.4, 0.1]]),
-            np.array([[0.5, 3, 0.4, 0.1]]),
-        ),
-        (
-            "max",
-            np.array([[1.0, 3.0, 0.0, 2.0], [4.0, 3.0, 1.0, 0.0]]),
-            np.array([[4.0, 3.0, 1.0, 2.0]]),
-        ),
-    ],
-)
-def test_calculate_cls_vector(pooling, features, expected):
-    featurizer = SpacyFeaturizer.create({"pooling": pooling}, RasaNLUModelConfig())
-
-    actual = featurizer._calculate_cls_vector(features)
-
-    assert np.all(actual == expected)

From 899a69f4084fdad3f232a96dd218b91b6d75fd32 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 14:19:45 +0100
Subject: [PATCH 252/633] Update docs/nlu/components.rst

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 docs/nlu/components.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 521b5d0fb96f..0a86e0c2f537 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -455,7 +455,7 @@ SklearnIntentClassifier
 DIETClassifier
 ~~~~~~~~~~~~~~
 
-:Short: Embedding intent classifier
+:Short: Dual Intent Entity Transformer used for intent classification and entity extraction
 :Outputs: ``intent`` and ``intent_ranking``
 :Requires: A featurizer
 :Output-Example:

From 2cb6e0a13c7acc706e440903f007e2f9c7450ce0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 14:22:59 +0100
Subject: [PATCH 253/633] update docs

---
 docs/core/old-core-change-log.rst | 2 +-
 docs/core/policies.rst            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/core/old-core-change-log.rst b/docs/core/old-core-change-log.rst
index bda2dc5a065f..2ee3a48b5442 100644
--- a/docs/core/old-core-change-log.rst
+++ b/docs/core/old-core-change-log.rst
@@ -555,7 +555,7 @@ Added
 - intent confidence support in RegexInterpreter
 - added paramter to train script to pull training data from an url instead
   of a stories file
-- added new policy: :ref:`embedding_policy` implemented in tensorflow
+- added new policy: :ref:`ted_policy` implemented in tensorflow
 
 Changed
 -------
diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index d4985c6a3e86..4d44423bbc13 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -190,7 +190,7 @@ In order to get reproducible training results for the same inputs you can
 set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer.
 
 
-.. _embedding_policy:
+.. _ted_policy:
 
 TED Policy
 ^^^^^^^^^^

From dcee85767692ccdc3dcba836a9b491e31162ccc7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 14:24:55 +0100
Subject: [PATCH 254/633] review comments

---
 rasa/nlu/classifiers/DIET_classifier.py | 2 +-
 rasa/utils/tensorflow/tf_models.py      | 4 ++--
 tests/core/test_policies.py             | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index bf5126d46c06..6127f533188d 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -572,7 +572,7 @@ def train(
             self.component_config[BATCH_SIZES],
             self.component_config[EVAL_NUM_EXAMPLES],
             self.component_config[EVAL_NUM_EPOCHS],
-            batch_strategy=self.component_config[BATCH_STRATEGY],
+            self.component_config[BATCH_STRATEGY],
         )
 
     # process helpers
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 3761c660e086..f1ad5694ae35 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -17,8 +17,8 @@ class RasaModel(tf.keras.models.Model):
     Cannot be used as tf.keras.Model
     """
 
-    def __init__(self, random_seed: Optional[int], *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, random_seed: Optional[int] = None, **kwargs):
+        super().__init__(**kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")
         self.metrics_to_log = ["t_loss"]
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index e9a3bda513c0..62d14ed46fc3 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -229,6 +229,7 @@ def create_policy(self, featurizer, priority, **kwargs):
         p = SklearnPolicy(featurizer, priority, **kwargs)
         return p
 
+    # TODO fix and test tf config
     @pytest.yield_fixture
     def mock_search(self):
         with patch("rasa.core.policies.sklearn_policy.GridSearchCV") as gs:

From f66644fd10b43f30cca8c67f4dd2a507398973d7 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Thu, 30 Jan 2020 14:28:41 +0100
Subject: [PATCH 255/633] Update rasa/utils/tensorflow/tf_layers.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/utils/tensorflow/tf_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index a6480462f784..d945ca43a2ae 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -63,7 +63,7 @@ def __init__(self, sparsity: int = 0.8, **kwargs) -> None:
 
     def build(self, input_shape: tf.TensorShape) -> None:
         super().build(input_shape)
-        # create random mask to set weights to 0
+        # create random mask to set some weights to 0
         kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
         kernel_mask = tf.cast(
             tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype

From 8252ce3d48cbe392f5d2dba3be4e22374763aa20 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 14:58:36 +0100
Subject: [PATCH 256/633] set label attribute to none for only NER

---
 rasa/nlu/classifiers/DIET_classifier.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index 6127f533188d..0a1e1a9605bb 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -440,11 +440,7 @@ def _create_model_data(
         tag_ids = []
 
         for e in training_data:
-            if (
-                label_attribute is None
-                or label_attribute == INTENT_ATTRIBUTE
-                or e.get(label_attribute)
-            ):
+            if label_attribute is None or e.get(label_attribute):
                 _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE)
                 if _sparse is not None:
                     X_sparse.append(_sparse)
@@ -513,11 +509,15 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         tag_id_dict = self._create_tag_id_dict(training_data)
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
+        label_attribute = (
+            INTENT_ATTRIBUTE if self.component_config[INTENT_CLASSIFICATION] else None
+        )
+
         model_data = self._create_model_data(
             training_data.training_examples,
             label_id_dict,
             tag_id_dict,
-            label_attribute=INTENT_ATTRIBUTE,
+            label_attribute=label_attribute,
         )
 
         self.num_tags = len(self.inverted_tag_dict)

From 8640795e2adc7374bdcf2199463b9abbf2cb4e4f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 15:05:18 +0100
Subject: [PATCH 257/633] skip tf config test

---
 tests/core/test_policies.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 62d14ed46fc3..f692535275a5 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -214,6 +214,7 @@ def create_policy(self, featurizer, priority):
         p = KerasPolicy(featurizer, priority, **tf_defaults())
         return p
 
+    # TODO fix and test tf config
     @pytest.mark.skip(reason="We need to fix tf.config!")
     def test_tf_config(self, trained_policy, tmpdir):
         # noinspection PyProtectedMember
@@ -229,7 +230,6 @@ def create_policy(self, featurizer, priority, **kwargs):
         p = SklearnPolicy(featurizer, priority, **kwargs)
         return p
 
-    # TODO fix and test tf config
     @pytest.yield_fixture
     def mock_search(self):
         with patch("rasa.core.policies.sklearn_policy.GridSearchCV") as gs:
@@ -512,19 +512,20 @@ def test_featurizer(self, trained_policy, tmpdir):
         )
 
 
-# TODO test tf config
-# class TestTEDPolicyWithTfConfig(TestTEDPolicy):
-#     def create_policy(self, featurizer, priority):
-#         p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
-#         return p
-#
-#     def test_tf_config(self, trained_policy, tmpdir):
-#         # noinspection PyProtectedMember
-#         assert trained_policy.session._config == session_config()
-#         trained_policy.persist(tmpdir.strpath)
-#         loaded = trained_policy.__class__.load(tmpdir.strpath)
-#         # noinspection PyProtectedMember
-#         assert loaded.session._config == session_config()
+class TestTEDPolicyWithTfConfig(TestTEDPolicy):
+    def create_policy(self, featurizer, priority):
+        p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
+        return p
+
+    # TODO test tf config
+    @pytest.mark.skip(reason="Fix tf config.")
+    def test_tf_config(self, trained_policy, tmpdir):
+        # noinspection PyProtectedMember
+        assert trained_policy.session._config == session_config()
+        trained_policy.persist(tmpdir.strpath)
+        loaded = trained_policy.__class__.load(tmpdir.strpath)
+        # noinspection PyProtectedMember
+        assert loaded.session._config == session_config()
 
 
 class TestMemoizationPolicy(PolicyTestCollection):

From 025ac1375b28e70119237e5b009380c7463c3963 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 15:11:23 +0100
Subject: [PATCH 258/633] rename similarity type method

---
 rasa/core/policies/embedding_policy.py              | 4 ++--
 rasa/nlu/classifiers/embedding_intent_classifier.py | 4 ++--
 rasa/utils/train_utils.py                           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 2a1764d2de40..f56c2f7ae848 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -170,7 +170,7 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
 
-        self.config = train_utils.update_auto_similarity_type(self.config)
+        self.config = train_utils.update_similarity_type(self.config)
 
         if self.config[EVAL_NUM_EPOCHS] < 1:
             self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
@@ -401,7 +401,7 @@ def load(cls, path: Text) -> "EmbeddingPolicy":
         with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
             meta = pickle.load(f)
 
-        meta = train_utils.update_auto_similarity_type(meta)
+        meta = train_utils.update_similarity_type(meta)
 
         model = TED.load(
             tf_model_file,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index baa5ea8b2a5a..dbba3ee7ae21 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -194,7 +194,7 @@ def _check_config_parameters(self) -> None:
                 "hidden_layer_sizes for text and label must coincide."
             )
 
-        self.component_config = train_utils.update_auto_similarity_type(
+        self.component_config = train_utils.update_similarity_type(
             self.component_config
         )
 
@@ -815,7 +815,7 @@ def load(
         ) as f:
             batch_tuple_sizes = pickle.load(f)
 
-        meta = train_utils.update_auto_similarity_type(meta)
+        meta = train_utils.update_similarity_type(meta)
 
         model = DIET.load(
             tf_model_file,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 2ae6c4e84871..17ebcb124ab8 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -33,7 +33,7 @@ def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarr
     return new_values
 
 
-def update_auto_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
+def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
     if config[SIMILARITY_TYPE] == "auto":
         if config[LOSS_TYPE] == "softmax":
             config[SIMILARITY_TYPE] = "inner"

From 32c46bfdfef53a1ddc162711bd58049802687ad9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 15:28:08 +0100
Subject: [PATCH 259/633] clean up signatures

---
 rasa/core/policies/registry.py                |  2 +-
 .../policies/{TED_policy.py => ted_policy.py} |  0
 ...{DIET_classifier.py => diet_classifier.py} |  2 --
 .../classifiers/keyword_intent_classifier.py  | 20 ++++++++-----------
 .../classifiers/mitie_intent_classifier.py    |  5 ++++-
 .../classifiers/sklearn_intent_classifier.py  |  5 ++++-
 .../nlu/extractors/duckling_http_extractor.py |  1 -
 rasa/nlu/featurizers/featurizer.py            |  3 +--
 .../count_vectors_featurizer.py               | 10 +++++-----
 .../lexical_syntactic_featurizer.py           |  6 ++----
 .../sparse_featurizer/regex_featurizer.py     |  6 ++----
 rasa/nlu/registry.py                          |  2 +-
 .../selectors/embedding_response_selector.py  |  2 +-
 tests/core/test_policies.py                   |  2 +-
 tests/nlu/classifiers/test_DIET_classifier.py |  2 +-
 15 files changed, 31 insertions(+), 37 deletions(-)
 rename rasa/core/policies/{TED_policy.py => ted_policy.py} (100%)
 rename rasa/nlu/classifiers/{DIET_classifier.py => diet_classifier.py} (99%)

diff --git a/rasa/core/policies/registry.py b/rasa/core/policies/registry.py
index 1c26d9ce21a5..350ed735742a 100644
--- a/rasa/core/policies/registry.py
+++ b/rasa/core/policies/registry.py
@@ -2,7 +2,7 @@
 # path. Don't do this in `__init__.py` to avoid importing them without need.
 
 # noinspection PyUnresolvedReferences
-from rasa.core.policies.TED_policy import TEDPolicy
+from rasa.core.policies.ted_policy import TEDPolicy
 
 # noinspection PyUnresolvedReferences
 from rasa.core.policies.fallback import FallbackPolicy
diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/ted_policy.py
similarity index 100%
rename from rasa/core/policies/TED_policy.py
rename to rasa/core/policies/ted_policy.py
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
similarity index 99%
rename from rasa/nlu/classifiers/DIET_classifier.py
rename to rasa/nlu/classifiers/diet_classifier.py
index 8c1e0472f471..8c6e891726a2 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -672,8 +672,6 @@ def _predict_entities(
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
 
-        print(tags)
-
         if self.component_config[BILOU_FLAG]:
             tags = bilou_utils.remove_bilou_prefixes(tags)
 
diff --git a/rasa/nlu/classifiers/keyword_intent_classifier.py b/rasa/nlu/classifiers/keyword_intent_classifier.py
index b1363d07ff86..eea75e464442 100644
--- a/rasa/nlu/classifiers/keyword_intent_classifier.py
+++ b/rasa/nlu/classifiers/keyword_intent_classifier.py
@@ -1,5 +1,4 @@
 import os
-import warnings
 import logging
 import typing
 import re
@@ -8,18 +7,15 @@
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu import utils
 from rasa.nlu.components import Component
-from rasa.nlu.training_data import Message
 from rasa.nlu.constants import INTENT_ATTRIBUTE
 from rasa.utils.common import raise_warning
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import TrainingData
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import Message
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from rasa.nlu.config import RasaNLUModelConfig
-    from rasa.nlu.training_data import TrainingData
-    from rasa.nlu.model import Metadata
-    from rasa.nlu.training_data import Message
-
 
 class KeywordIntentClassifier(Component):
     """Intent classifier using simple keyword matching.
@@ -47,8 +43,8 @@ def __init__(
 
     def train(
         self,
-        training_data: "TrainingData",
-        cfg: Optional["RasaNLUModelConfig"] = None,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
 
@@ -145,7 +141,7 @@ def load(
         cls,
         meta: Dict[Text, Any],
         model_dir: Optional[Text] = None,
-        model_metadata: "Metadata" = None,
+        model_metadata: Metadata = None,
         cached_component: Optional["KeywordIntentClassifier"] = None,
         **kwargs: Any,
     ) -> "KeywordIntentClassifier":
@@ -158,7 +154,7 @@ def load(
             else:
                 raise_warning(
                     f"Failed to load key word file for `IntentKeywordClassifier`, "
-                    f"maybe {keyword_file} does not exist?",
+                    f"maybe {keyword_file} does not exist?"
                 )
                 intent_keyword_map = None
             return cls(meta, intent_keyword_map)
diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py
index cf156980b30c..2fc76ab294cb 100644
--- a/rasa/nlu/classifiers/mitie_intent_classifier.py
+++ b/rasa/nlu/classifiers/mitie_intent_classifier.py
@@ -32,7 +32,10 @@ def required_packages(cls) -> List[Text]:
         return ["mitie"]
 
     def train(
-        self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
         import mitie
 
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 70e6a6d0f276..bd6f6f424255 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -82,7 +82,10 @@ def transform_labels_num2str(self, y: np.ndarray) -> np.ndarray:
         return self.le.inverse_transform(y)
 
     def train(
-        self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
         """Train the intent classifier on a data set."""
 
diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py
index 835994e1a3fe..ec57abfd4eb2 100644
--- a/rasa/nlu/extractors/duckling_http_extractor.py
+++ b/rasa/nlu/extractors/duckling_http_extractor.py
@@ -1,7 +1,6 @@
 import time
 import json
 import logging
-import warnings
 import os
 import requests
 from typing import Any, List, Optional, Text, Dict
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index aa9e47a331a5..075ac1331ece 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -1,7 +1,6 @@
 import numpy as np
-import warnings
 import scipy.sparse
-from typing import Any, Text, Union, Optional, Dict
+from typing import Any, Text, Union, Optional
 from rasa.nlu.training_data import Message
 from rasa.nlu.components import Component
 from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 4e6c547ef78f..d83de943a457 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -264,7 +264,7 @@ def _replace_with_oov_token(
         return tokens
 
     def _get_processed_message_tokens_by_attribute(
-        self, message: "Message", attribute: Text = TEXT_ATTRIBUTE
+        self, message: Message, attribute: Text = TEXT_ATTRIBUTE
     ) -> List[Text]:
         """Get processed text of attribute of a message"""
 
@@ -302,7 +302,7 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]) -> None:
             )
 
     def _get_all_attributes_processed_tokens(
-        self, training_data: "TrainingData"
+        self, training_data: TrainingData
     ) -> Dict[Text, List[List[Text]]]:
         """Get processed text for all attributes of examples in training data"""
 
@@ -444,7 +444,7 @@ def _get_featurized_attribute(
             return None
 
     def _set_attribute_features(
-        self, attribute: Text, attribute_features: List, training_data: "TrainingData"
+        self, attribute: Text, attribute_features: List, training_data: TrainingData
     ) -> None:
         """Set computed features of the attribute to corresponding message objects"""
         for i, example in enumerate(training_data.training_examples):
@@ -572,7 +572,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
     @classmethod
     def _create_shared_vocab_vectorizers(
         cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None
-    ) -> Dict[Text, "CountVectorizer"]:
+    ) -> Dict[Text, CountVectorizer]:
         """Create vectorizers for all attributes with shared vocabulary"""
 
         shared_vectorizer = CountVectorizer(
@@ -598,7 +598,7 @@ def _create_shared_vocab_vectorizers(
     @classmethod
     def _create_independent_vocab_vectorizers(
         cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None
-    ) -> Dict[Text, "CountVectorizer"]:
+    ) -> Dict[Text, CountVectorizer]:
         """Create vectorizers for all attributes with independent vocabulary"""
 
         attribute_vectorizers = {}
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 011a1785046d..261d1fa01209 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -13,12 +13,10 @@
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+from rasa.nlu.model import Metadata
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from rasa.nlu.model import Metadata
-
 
 class LexicalSyntacticFeaturizer(Featurizer):
 
@@ -269,7 +267,7 @@ def load(
         cls,
         meta: Dict[Text, Any],
         model_dir: Optional[Text] = None,
-        model_metadata: Optional["Metadata"] = None,
+        model_metadata: Optional[Metadata] = None,
         cached_component: Optional["LexicalSyntacticFeaturizer"] = None,
         **kwargs: Any,
     ) -> "LexicalSyntacticFeaturizer":
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index f181f841c43b..2f11cce22487 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -22,12 +22,10 @@
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
+from rasa.nlu.model import Metadata
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from rasa.nlu.model import Metadata
-
 
 class RegexFeaturizer(Featurizer):
 
@@ -168,7 +166,7 @@ def load(
         cls,
         meta: Dict[Text, Any],
         model_dir: Optional[Text] = None,
-        model_metadata: Optional["Metadata"] = None,
+        model_metadata: Optional[Metadata] = None,
         cached_component: Optional["RegexFeaturizer"] = None,
         **kwargs: Any,
     ) -> "RegexFeaturizer":
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index ebf6840bd02b..3131a1598da3 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -10,7 +10,7 @@
 
 from rasa.constants import DOCS_URL_COMPONENTS
 
-from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
 from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 4c76ed6bfb49..1b6a04278a17 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, Text
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_TEXT,
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index f692535275a5..5d0a4a2c080b 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -24,7 +24,7 @@
     FullDialogueTrackerFeaturizer,
 )
 from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy
-from rasa.core.policies.TED_policy import TEDPolicy
+from rasa.core.policies.ted_policy import TEDPolicy
 from rasa.core.policies.fallback import FallbackPolicy
 from rasa.core.policies.form_policy import FormPolicy
 from rasa.core.policies.keras_policy import KerasPolicy
diff --git a/tests/nlu/classifiers/test_DIET_classifier.py b/tests/nlu/classifiers/test_DIET_classifier.py
index 64c4a66f5972..19633b5b9ab4 100644
--- a/tests/nlu/classifiers/test_DIET_classifier.py
+++ b/tests/nlu/classifiers/test_DIET_classifier.py
@@ -13,7 +13,7 @@
     INTENT_ATTRIBUTE,
 )
 from rasa.utils.tensorflow.constants import LOSS_TYPE, RANDOM_SEED, RANKING_LENGTH
-from rasa.nlu.classifiers.DIET_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.model import Interpreter
 from rasa.nlu.training_data import Message
 from rasa.utils import train_utils

From c333afa9fef7eec23ffeb493ff0720a8a9936f83 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 15:36:11 +0100
Subject: [PATCH 260/633] rename test file

---
 .../{test_DIET_classifier.py => test_diet_classifier.py}          | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/nlu/classifiers/{test_DIET_classifier.py => test_diet_classifier.py} (100%)

diff --git a/tests/nlu/classifiers/test_DIET_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
similarity index 100%
rename from tests/nlu/classifiers/test_DIET_classifier.py
rename to tests/nlu/classifiers/test_diet_classifier.py

From 941e1be1144aa3f08fe8d182bb41cba73cf67753 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 15:49:27 +0100
Subject: [PATCH 261/633] treat edge case for spacy cls vector

---
 rasa/nlu/featurizers/featurizer.py       | 7 ++++++-
 tests/nlu/featurizers/test_featurizer.py | 9 ++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index aa9e47a331a5..caccdc1a5e8f 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -69,12 +69,17 @@ def _combine_with_existing_sparse_features(
         else:
             return additional_features
 
+    @staticmethod
     def _calculate_cls_vector(
-        self, features: np.ndarray, pooling_operation: Text
+        features: np.ndarray, pooling_operation: Text
     ) -> np.ndarray:
         # take only non zeros feature vectors into account
         non_zero_features = np.array([f for f in features if f.any()])
 
+        # if features are all zero just return a vector with all zeros
+        if non_zero_features.size == 0:
+            return np.zeros([1, features.shape[-1]])
+
         if pooling_operation == "mean":
             return np.mean(non_zero_features, axis=0, keepdims=True)
         elif pooling_operation == "max":
diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py
index 04989097aa9f..ee6af4f197f6 100644
--- a/tests/nlu/featurizers/test_featurizer.py
+++ b/tests/nlu/featurizers/test_featurizer.py
@@ -116,11 +116,14 @@ def test_sequence_to_sentence_features(features, expected):
             np.array([[1.0, 3.0, 0.0, 2.0], [4.0, 3.0, 1.0, 0.0]]),
             np.array([[4.0, 3.0, 1.0, 2.0]]),
         ),
+        (
+            "max",
+            np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]),
+            np.array([[0.0, 0.0, 0.0, 0.0]]),
+        ),
     ],
 )
 def test_calculate_cls_vector(pooling, features, expected):
-    featurizer = Featurizer()
-
-    actual = featurizer._calculate_cls_vector(features, pooling)
+    actual = Featurizer._calculate_cls_vector(features, pooling)
 
     assert np.all(actual == expected)

From fdfdd4c11baf5780fbd498732510fd88a666072b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 17:23:05 +0100
Subject: [PATCH 262/633] deprecate CRFEntityExtractor and
 EmbeddingIntentClassifier

---
 rasa/nlu/classifiers/diet_classifier.py       |  87 ++-
 .../embedding_intent_classifier.py            | 150 ++++
 rasa/nlu/extractors/crf_entity_extractor.py   | 680 +++---------------
 rasa/nlu/registry.py                          |   8 +-
 .../selectors/embedding_response_selector.py  |  24 +-
 rasa/utils/tensorflow/tf_model_data.py        |   4 +
 rasa/utils/tensorflow/tf_models.py            |   2 +-
 7 files changed, 305 insertions(+), 650 deletions(-)
 create mode 100644 rasa/nlu/classifiers/embedding_intent_classifier.py

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 8c6e891726a2..6c6ddab7dd71 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -40,7 +40,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    POS_ENCODING,
     MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
@@ -118,8 +117,6 @@ class DIETClassifier(EntityExtractor):
         NUM_TRANSFORMER_LAYERS: 2,
         # number of attention heads in transformer
         NUM_HEADS: 4,
-        # type of positional encoding in transformer
-        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
         MAX_SEQ_LENGTH: 256,
         # training parameters
@@ -167,6 +164,8 @@ class DIETClassifier(EntityExtractor):
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
+        # if true apply dropout to sparse tensors
+        SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -180,28 +179,29 @@ class DIETClassifier(EntityExtractor):
         # if true random tokens of the input message will be masked and the model
         # should predict those tokens
         MASKED_LM: False,
-        # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: True,
-        # if true BILOU schema is used for entities
-        BILOU_FLAG: False,
+        # BILOU_flag determines whether to use BILOU tagging or not.
+        # More rigorous however requires more examples per entity
+        # rule of thumb: use only if more than 100 egs. per entity
+        BILOU_FLAG: True,
     }
     # end default properties (DOC MARKER - don't remove)
 
     # init helpers
     def _check_config_parameters(self) -> None:
-        if (
-            self.component_config[SHARE_HIDDEN_LAYERS]
-            and self.component_config[HIDDEN_LAYERS_SIZES_TEXT]
-            != self.component_config[HIDDEN_LAYERS_SIZES_LABEL]
-        ):
-            raise ValueError(
-                "If hidden layer weights are shared,"
-                "hidden_layer_sizes for text and label must coincide."
-            )
+        if self.component_config[INTENT_CLASSIFICATION]:
+            if (
+                self.component_config[SHARE_HIDDEN_LAYERS]
+                and self.component_config[HIDDEN_LAYERS_SIZES_TEXT]
+                != self.component_config[HIDDEN_LAYERS_SIZES_LABEL]
+            ):
+                raise ValueError(
+                    "If hidden layer weights are shared,"
+                    "hidden_layer_sizes for text and label must coincide."
+                )
 
-        self.component_config = train_utils.update_similarity_type(
-            self.component_config
-        )
+            self.component_config = train_utils.update_similarity_type(
+                self.component_config
+            )
 
         if self.component_config[EVAL_NUM_EPOCHS] < 1:
             self.component_config[EVAL_NUM_EPOCHS] = self.component_config[EPOCHS]
@@ -242,6 +242,10 @@ def __init__(
 
         self.data_example = None
 
+        self.label_key = (
+            "label_ids" if self.component_config[INTENT_CLASSIFICATION] else "tag_ids"
+        )
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -483,7 +487,7 @@ def _create_model_data(
         label_ids = np.array(label_ids)
         tag_ids = np.array(tag_ids)
 
-        model_data = RasaModelData(label_key="label_ids")
+        model_data = RasaModelData(label_key=self.label_key)
         model_data.add_features("text_features", [X_sparse, X_dense])
         model_data.add_features("label_features", [Y_sparse, Y_dense])
         if label_attribute and model_data.feature_not_exists("label_features"):
@@ -789,10 +793,10 @@ def load(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
+        label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else "tag_ids"
+
         with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "rb") as f:
-            model_data_example = RasaModelData(
-                label_key="label_ids", data=pickle.load(f)
-            )
+            model_data_example = RasaModelData(label_key=label_key, data=pickle.load(f))
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
             label_data = pickle.load(f)
@@ -822,7 +826,7 @@ def load(
         )
         # build the graph for prediction
         predict_data_example = RasaModelData(
-            label_key="label_ids",
+            label_key=label_key,
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
         model.build_for_predict(predict_data_example)
@@ -895,9 +899,12 @@ def _update_metrics_to_log(self) -> None:
 
     def _prepare_layers(self) -> None:
         self._prepare_sequence_layers()
-        self._prepare_mask_lm_layers()
-        self._prepare_intent_classification_layers()
-        self._prepare_entity_recognition_layers()
+        if self.config[MASKED_LM]:
+            self._prepare_mask_lm_layers()
+        if self.config[INTENT_CLASSIFICATION]:
+            self._prepare_intent_classification_layers()
+        if self.config[ENTITY_RECOGNITION]:
+            self._prepare_entity_recognition_layers()
 
     @staticmethod
     def _create_sparse_dense_layer(
@@ -931,24 +938,26 @@ def _prepare_sequence_layers(self) -> None:
             self.config[C2],
             self.config[DENSE_DIM]["text"],
         )
-        self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
-            self.data_signature["label_features"],
-            "label",
-            self.config[C2],
-            self.config[DENSE_DIM]["label"],
-        )
+        if self.config[INTENT_CLASSIFICATION]:
+            self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
+                self.data_signature["label_features"],
+                "label",
+                self.config[C2],
+                self.config[DENSE_DIM]["label"],
+            )
         self._tf_layers["ffnn.text"] = tf_layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_TEXT],
             self.config[DROPRATE],
             self.config[C2],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
         )
-        self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
-            self.config[HIDDEN_LAYERS_SIZES_LABEL],
-            self.config[DROPRATE],
-            self.config[C2],
-            "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
-        )
+        if self.config[INTENT_CLASSIFICATION]:
+            self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
+                self.config[HIDDEN_LAYERS_SIZES_LABEL],
+                self.config[DROPRATE],
+                self.config[C2],
+                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
+            )
         self._tf_layers["transformer"] = (
             tf_layers.TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
new file mode 100644
index 000000000000..05a0260f5498
--- /dev/null
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -0,0 +1,150 @@
+import logging
+from typing import Any, Dict, Optional, Text
+
+from rasa.constants import DOCS_BASE_URL
+from rasa.nlu.components import any_of
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TEXT_ATTRIBUTE,
+    ENTITIES_ATTRIBUTE,
+    DENSE_FEATURE_NAMES,
+    SPARSE_FEATURE_NAMES,
+)
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_TEXT,
+    HIDDEN_LAYERS_SIZES_LABEL,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    SPARSE_INPUT_DROPOUT,
+    MASKED_LM,
+    ENTITY_RECOGNITION,
+    INTENT_CLASSIFICATION,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
+    DROPRATE,
+    C_EMB,
+    C2,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+)
+from utils.common import raise_warning
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingIntentClassifier(DIETClassifier):
+
+    provides = [ENTITIES_ATTRIBUTE]
+
+    requires = [
+        TOKENS_NAMES[TEXT_ATTRIBUTE],
+        any_of(
+            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        ),
+    ]
+
+    # default properties (DOC MARKER - don't remove)
+    defaults = {
+        # nn architecture
+        # sizes of hidden layers before the embedding layer for input words
+        # the number of hidden layers is thus equal to the length of this list
+        HIDDEN_LAYERS_SIZES_TEXT: [],
+        # sizes of hidden layers before the embedding layer for intent labels
+        # the number of hidden layers is thus equal to the length of this list
+        HIDDEN_LAYERS_SIZES_LABEL: [],
+        # Whether to share the hidden layer weights between input words and labels
+        SHARE_HIDDEN_LAYERS: False,
+        # number of units in transformer
+        TRANSFORMER_SIZE: 256,
+        # number of transformer layers
+        NUM_TRANSFORMER_LAYERS: 2,
+        # number of attention heads in transformer
+        NUM_HEADS: 4,
+        # max sequence length if pos_encoding='emb'
+        MAX_SEQ_LENGTH: 256,
+        # training parameters
+        # initial and final batch sizes - batch size will be
+        # linearly increased for each epoch
+        BATCH_SIZES: [64, 256],
+        # how to create batches
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        # number of epochs
+        EPOCHS: 300,
+        # set random seed to any int to get reproducible results
+        RANDOM_SEED: None,
+        # optimizer
+        LEARNING_RATE: 0.001,
+        # embedding parameters
+        # default dense dimension used if no dense features are present
+        DENSE_DIM: {"text": 512, "label": 20},
+        # dimension size of embedding vectors
+        EMBED_DIM: 20,
+        # the type of the similarity
+        NUM_NEG: 20,
+        # flag if minimize only maximum similarity over incorrect actions
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
+        # the type of the loss function
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        # number of top intents to normalize scores for softmax loss_type
+        # set to 0 to turn off normalization
+        RANKING_LENGTH: 10,
+        # how similar the algorithm should try
+        # to make embedding vectors for correct labels
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        # maximum negative similarity for incorrect labels
+        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        # flag: if true, only minimize the maximum similarity for incorrect labels
+        USE_MAX_SIM_NEG: True,
+        # scale loss inverse proportionally to confidence of correct prediction
+        SCALE_LOSS: True,
+        # regularization parameters
+        # the scale of L2 regularization
+        C2: 0.002,
+        # the scale of how critical the algorithm should be of minimizing the
+        # maximum similarity between embeddings of different labels
+        C_EMB: 0.8,
+        # dropout rate for rnn
+        DROPRATE: 0.2,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
+        # if true apply dropout to sparse tensors
+        SPARSE_INPUT_DROPOUT: True,
+        # visualization of accuracy
+        # how often to calculate training accuracy
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
+        # how many examples to use for calculation of training accuracy
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+    }
+    # end default properties (DOC MARKER - don't remove)
+
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
+
+        component_config[INTENT_CLASSIFICATION] = True
+        component_config[ENTITY_RECOGNITION] = False
+        component_config[MASKED_LM] = False
+
+        super().__init__(component_config)
+
+        raise_warning(
+            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead ."
+            f" Check '{DOCS_BASE_URL}/nlu/components/' for more details.",
+            DeprecationWarning,
+        )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 1548e1ea77ba..7a0e39ff5d7f 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -1,621 +1,111 @@
 import logging
-import warnings
-import os
-import typing
-import numpy as np
-from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
+from typing import Any, Dict, Optional, Text
 
-import rasa.nlu.utils.bilou_utils as bilou_utils
-from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig
-from rasa.nlu.extractors import EntityExtractor
-from rasa.nlu.model import Metadata
-from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu.training_data import Message, TrainingData
+from rasa.constants import DOCS_BASE_URL
+from rasa.nlu.components import any_of
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
-    TOKENS_NAMES,
     TEXT_ATTRIBUTE,
-    DENSE_FEATURE_NAMES,
-    SPACY_DOCS,
     ENTITIES_ATTRIBUTE,
+    DENSE_FEATURE_NAMES,
+    SPARSE_FEATURE_NAMES,
 )
-from rasa.constants import (
-    DOCS_BASE_URL,
-    DOCS_URL_TRAINING_DATA_NLU,
-    DOCS_URL_COMPONENTS,
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_TEXT,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    SPARSE_INPUT_DROPOUT,
+    MASKED_LM,
+    ENTITY_RECOGNITION,
+    INTENT_CLASSIFICATION,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
+    DROPRATE,
+    C2,
+    BILOU_FLAG,
 )
-from rasa.utils.common import raise_warning
-
-try:
-    import spacy
-except ImportError:
-    spacy = None
+from utils.common import raise_warning
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from sklearn_crfsuite import CRF
-    from spacy.tokens import Doc
-
-
-class CRFToken(NamedTuple):
-    text: Text
-    tag: Text
-    entity: Text
-    pattern: Dict[Text, Any]
-    dense_features: np.ndarray
-
 
-class CRFEntityExtractor(EntityExtractor):
+class CRFEntityExtractor(DIETClassifier):
 
     provides = [ENTITIES_ATTRIBUTE]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+    requires = [
+        any_of(
+            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+        )
+    ]
 
+    # default properties (DOC MARKER - don't remove)
     defaults = {
+        # nn architecture
+        # sizes of hidden layers before the embedding layer for input words
+        # the number of hidden layers is thus equal to the length of this list
+        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
+        # training parameters
+        # initial and final batch sizes - batch size will be
+        # linearly increased for each epoch
+        BATCH_SIZES: [64, 256],
+        # how to create batches
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        # number of epochs
+        EPOCHS: 300,
+        # set random seed to any int to get reproducible results
+        RANDOM_SEED: None,
+        # optimizer
+        LEARNING_RATE: 0.001,
+        # embedding parameters
+        # default dense dimension used if no dense features are present
+        DENSE_DIM: {"text": 512, "label": 20},
+        # regularization parameters
+        # the scale of L2 regularization
+        C2: 0.002,
+        # dropout rate for rnn
+        DROPRATE: 0.2,
+        # if true apply dropout to sparse tensors
+        SPARSE_INPUT_DROPOUT: False,
+        # visualization of accuracy
+        # how often to calculate training accuracy
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
+        # how many examples to use for calculation of training accuracy
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
-        "BILOU_flag": True,
-        # crf_features is [before, word, after] array with before, word,
-        # after holding keys about which
-        # features to use for each word, for example, 'title' in
-        # array before will have the feature
-        # "is the preceding word in title case?"
-        # POS features require spaCy to be installed
-        "features": [
-            ["low", "title", "upper"],
-            [
-                "bias",
-                "low",
-                "prefix5",
-                "prefix2",
-                "suffix5",
-                "suffix3",
-                "suffix2",
-                "upper",
-                "title",
-                "digit",
-                "pattern",
-            ],
-            ["low", "title", "upper"],
-        ],
-        # The maximum number of iterations for optimization algorithms.
-        "max_iterations": 50,
-        # weight of theL1 regularization
-        "L1_c": 0.1,
-        # weight of the L2 regularization
-        "L2_c": 0.1,
+        BILOU_FLAG: True,
     }
+    # end default properties (DOC MARKER - don't remove)
 
-    function_dict = {
-        "low": lambda crf_token: crf_token.text.lower(),  # pytype: disable=attribute-error
-        "title": lambda crf_token: crf_token.text.istitle(),  # pytype: disable=attribute-error
-        "prefix5": lambda crf_token: crf_token.text[:5],
-        "prefix2": lambda crf_token: crf_token.text[:2],
-        "suffix5": lambda crf_token: crf_token.text[-5:],
-        "suffix3": lambda crf_token: crf_token.text[-3:],
-        "suffix2": lambda crf_token: crf_token.text[-2:],
-        "suffix1": lambda crf_token: crf_token.text[-1:],
-        "pos": lambda crf_token: crf_token.tag,
-        "pos2": lambda crf_token: crf_token.tag[:2],
-        "bias": lambda crf_token: "bias",
-        "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
-        "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
-        "pattern": lambda crf_token: crf_token.pattern,
-        "text_dense_features": lambda crf_token: crf_token.dense_features,
-    }
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
 
-    def __init__(
-        self,
-        component_config: Optional[Dict[Text, Any]] = None,
-        ent_tagger: Optional["CRF"] = None,
-    ) -> None:
+        component_config[INTENT_CLASSIFICATION] = False
+        component_config[ENTITY_RECOGNITION] = True
+        component_config[MASKED_LM] = False
+        component_config[TRANSFORMER_SIZE] = 128
+        component_config[NUM_TRANSFORMER_LAYERS] = 0
+        component_config[NUM_HEADS] = 4
+        component_config[SHARE_HIDDEN_LAYERS] = False
+        component_config[MAX_SEQ_LENGTH] = 256
+        component_config[UNIDIRECTIONAL_ENCODER] = True
 
         super().__init__(component_config)
 
-        self.ent_tagger = ent_tagger
-
-        self._validate_configuration()
-
-        self._check_pos_features_and_spacy()
-
-    def _check_pos_features_and_spacy(self) -> None:
-        import itertools
-
-        features = self.component_config.get("features", [])
-        fts = set(itertools.chain.from_iterable(features))
-        self.pos_features = "pos" in fts or "pos2" in fts
-        if self.pos_features:
-            self._check_spacy()
-
-    @staticmethod
-    def _check_spacy() -> None:
-        if spacy is None:
-            raise ImportError(
-                "Failed to import `spaCy`. "
-                "`spaCy` is required for POS features "
-                "See https://spacy.io/usage/ for installation"
-                "instructions."
-            )
-
-    def _validate_configuration(self) -> None:
-        if len(self.component_config.get("features", [])) % 2 != 1:
-            raise ValueError(
-                "Need an odd number of crf feature lists to have a center word."
-            )
-
-    @classmethod
-    def required_packages(cls) -> List[Text]:
-        return ["sklearn_crfsuite", "sklearn"]
-
-    def train(
-        self,
-        training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        # checks whether there is at least one
-        # example with an entity annotation
-        if training_data.entity_examples:
-            self._check_spacy_doc(training_data.training_examples[0])
-
-            # filter out pre-trained entity examples
-            filtered_entity_examples = self.filter_trainable_entities(
-                training_data.training_examples
-            )
-
-            # convert the dataset into features
-            # this will train on ALL examples, even the ones
-            # without annotations
-            dataset = self._create_dataset(filtered_entity_examples)
-
-            self._train_model(dataset)
-
-    def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
-        dataset = []
-
-        for example in examples:
-            entity_offsets = bilou_utils.map_message_entities(example)
-            dataset.append(self._from_json_to_crf(example, entity_offsets))
-
-        return dataset
-
-    def _check_spacy_doc(self, message: Message) -> None:
-        if self.pos_features and message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) is None:
-            raise InvalidConfigError(
-                "Could not find `spacy_doc` attribute for "
-                "message {}\n"
-                "POS features require a pipeline component "
-                "that provides `spacy_doc` attributes, i.e. `SpacyNLP`. "
-                "See {}/nlu/choosing-a-pipeline/#pretrained-embeddings-spacy "
-                "for details".format(message.text, DOCS_BASE_URL)
-            )
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-
-        self._check_spacy_doc(message)
-
-        extracted = self.add_extractor_name(self.extract_entities(message))
-        message.set(
-            ENTITIES_ATTRIBUTE,
-            message.get(ENTITIES_ATTRIBUTE, []) + extracted,
-            add_to_output=True,
-        )
-
-    def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
-        """Take a sentence and return entities in json format"""
-
-        if self.ent_tagger is not None:
-            text_data = self._from_text_to_crf(message)
-            features = self._sentence_to_features(text_data)
-            ents = self.ent_tagger.predict_marginals_single(features)
-            return self._from_crf_to_json(message, ents)
-        else:
-            return []
-
-    def most_likely_entity(self, idx: int, entities: List[Any]) -> Tuple[Text, Any]:
-        if len(entities) > idx:
-            entity_probs = entities[idx]
-        else:
-            entity_probs = None
-        if entity_probs:
-            label = max(entity_probs, key=lambda key: entity_probs[key])
-            if self.component_config["BILOU_flag"]:
-                # if we are using bilou flags, we will combine the prob
-                # of the B, I, L and U tags for an entity (so if we have a
-                # score of 60% for `B-address` and 40% and 30%
-                # for `I-address`, we will return 70%)
-                return (
-                    label,
-                    sum([v for k, v in entity_probs.items() if k[2:] == label[2:]]),
-                )
-            else:
-                return label, entity_probs[label]
-        else:
-            return "", 0.0
-
-    def _create_entity_dict(
-        self,
-        message: Message,
-        tokens: Union["Doc", List[Token]],
-        start: int,
-        end: int,
-        entity: str,
-        confidence: float,
-    ) -> Dict[Text, Any]:
-        if isinstance(tokens, list):  # tokens is a list of Token
-            _start = tokens[start].start
-            _end = tokens[end].end
-            value = tokens[start].text
-            value += "".join(
-                [
-                    message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text
-                    for i in range(start + 1, end + 1)
-                ]
-            )
-        else:  # tokens is a Doc
-            _start = tokens[start].idx
-            _end = tokens[start : end + 1].end_char
-            value = tokens[start : end + 1].text
-
-        return {
-            "start": _start,
-            "end": _end,
-            "value": value,
-            "entity": entity,
-            "confidence": confidence,
-        }
-
-    @staticmethod
-    def _tokens_without_cls(message: Message) -> List[Token]:
-        # [:-1] to remove the CLS token from the list of tokens
-        return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
-
-    def _find_bilou_end(self, word_idx, entities) -> Any:
-        ent_word_idx = word_idx + 1
-        finished = False
-
-        # get information about the first word, tagged with `B-...`
-        label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = bilou_utils.entity_name_from_tag(label)
-
-        while not finished:
-            label, label_confidence = self.most_likely_entity(ent_word_idx, entities)
-
-            confidence = min(confidence, label_confidence)
-
-            if label[2:] != entity_label:
-                # words are not tagged the same entity class
-                logger.debug(
-                    "Inconsistent BILOU tagging found, B- tag, L- "
-                    "tag pair encloses multiple entity classes.i.e. "
-                    "[B-a, I-b, L-a] instead of [B-a, I-a, L-a].\n"
-                    "Assuming B- class is correct."
-                )
-
-            if label.startswith("L-"):
-                # end of the entity
-                finished = True
-            elif label.startswith("I-"):
-                # middle part of the entity
-                ent_word_idx += 1
-            else:
-                # entity not closed by an L- tag
-                finished = True
-                ent_word_idx -= 1
-                logger.debug(
-                    "Inconsistent BILOU tagging found, B- tag not "
-                    "closed by L- tag, i.e [B-a, I-a, O] instead of "
-                    "[B-a, L-a, O].\nAssuming last tag is L-"
-                )
-        return ent_word_idx, confidence
-
-    def _handle_bilou_label(
-        self, word_idx: int, entities: List[Any]
-    ) -> Tuple[Any, Any, Any]:
-        label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = bilou_utils.entity_name_from_tag(label)
-
-        if bilou_utils.bilou_prefix_from_tag(label) == "U":
-            return word_idx, confidence, entity_label
-
-        elif bilou_utils.bilou_prefix_from_tag(label) == "B":
-            # start of multi word-entity need to represent whole extent
-            ent_word_idx, confidence = self._find_bilou_end(word_idx, entities)
-            return ent_word_idx, confidence, entity_label
-
-        else:
-            return None, None, None
-
-    def _from_crf_to_json(
-        self, message: Message, entities: List[Any]
-    ) -> List[Dict[Text, Any]]:
-
-        if self.pos_features:
-            tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
-        else:
-            tokens = self._tokens_without_cls(message)
-
-        if len(tokens) != len(entities):
-            raise Exception(
-                "Inconsistency in amount of tokens between crfsuite and message"
-            )
-
-        if self.component_config["BILOU_flag"]:
-            return self._convert_bilou_tagging_to_entity_result(
-                message, tokens, entities
-            )
-        else:
-            # not using BILOU tagging scheme, multi-word entities are split.
-            return self._convert_simple_tagging_to_entity_result(tokens, entities)
-
-    def _convert_bilou_tagging_to_entity_result(
-        self, message: Message, tokens: List[Token], entities: List[Dict[Text, float]]
-    ):
-        # using the BILOU tagging scheme
-        json_ents = []
-        word_idx = 0
-        while word_idx < len(tokens):
-            end_idx, confidence, entity_label = self._handle_bilou_label(
-                word_idx, entities
-            )
-
-            if end_idx is not None:
-                ent = self._create_entity_dict(
-                    message, tokens, word_idx, end_idx, entity_label, confidence
-                )
-                json_ents.append(ent)
-                word_idx = end_idx + 1
-            else:
-                word_idx += 1
-        return json_ents
-
-    def _convert_simple_tagging_to_entity_result(
-        self, tokens: List[Union[Token, Any]], entities: List[Any]
-    ) -> List[Dict[Text, Any]]:
-        json_ents = []
-
-        for word_idx in range(len(tokens)):
-            entity_label, confidence = self.most_likely_entity(word_idx, entities)
-            word = tokens[word_idx]
-            if entity_label != "O":
-                if self.pos_features and not isinstance(word, Token):
-                    start = word.idx
-                    end = word.idx + len(word)
-                else:
-                    start = word.start
-                    end = word.end
-                ent = {
-                    "start": start,
-                    "end": end,
-                    "value": word.text,
-                    "entity": entity_label,
-                    "confidence": confidence,
-                }
-                json_ents.append(ent)
-
-        return json_ents
-
-    @classmethod
-    def load(
-        cls,
-        meta: Dict[Text, Any],
-        model_dir: Text = None,
-        model_metadata: Metadata = None,
-        cached_component: Optional["CRFEntityExtractor"] = None,
-        **kwargs: Any,
-    ) -> "CRFEntityExtractor":
-        from sklearn.externals import joblib
-
-        file_name = meta.get("file")
-        model_file = os.path.join(model_dir, file_name)
-
-        if os.path.exists(model_file):
-            ent_tagger = joblib.load(model_file)
-            return cls(meta, ent_tagger)
-        else:
-            return cls(meta)
-
-    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
-        """Persist this model into the passed directory.
-
-        Returns the metadata necessary to load the model again."""
-
-        from sklearn.externals import joblib
-
-        file_name = file_name + ".pkl"
-        if self.ent_tagger:
-            model_file_name = os.path.join(model_dir, file_name)
-            joblib.dump(self.ent_tagger, model_file_name)
-
-        return {"file": file_name}
-
-    def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any]]:
-        """Convert a word into discrete features in self.crf_features,
-        including word before and word after."""
-
-        configured_features = self.component_config["features"]
-        sentence_features = []
-
-        for word_idx in range(len(sentence)):
-            # word before(-1), current word(0), next word(+1)
-            feature_span = len(configured_features)
-            half_span = feature_span // 2
-            feature_range = range(-half_span, half_span + 1)
-            prefixes = [str(i) for i in feature_range]
-            word_features = {}
-            for f_i in feature_range:
-                if word_idx + f_i >= len(sentence):
-                    word_features["EOS"] = True
-                    # End Of Sentence
-                elif word_idx + f_i < 0:
-                    word_features["BOS"] = True
-                    # Beginning Of Sentence
-                else:
-                    word = sentence[word_idx + f_i]
-                    f_i_from_zero = f_i + half_span
-                    prefix = prefixes[f_i_from_zero]
-                    features = configured_features[f_i_from_zero]
-                    for feature in features:
-                        if feature == "pattern":
-                            # add all regexes as a feature
-                            regex_patterns = self.function_dict[feature](word)
-                            # pytype: disable=attribute-error
-                            for p_name, matched in regex_patterns.items():
-                                feature_name = prefix + ":" + feature + ":" + p_name
-                                word_features[feature_name] = matched
-                            # pytype: enable=attribute-error
-                        else:
-                            # append each feature to a feature vector
-                            value = self.function_dict[feature](word)
-                            word_features[prefix + ":" + feature] = value
-            sentence_features.append(word_features)
-        return sentence_features
-
-    @staticmethod
-    def _sentence_to_labels(
-        sentence: List[
-            Tuple[
-                Optional[Text],
-                Optional[Text],
-                Text,
-                Dict[Text, Any],
-                Optional[Dict[str, Any]],
-            ]
-        ],
-    ) -> List[Text]:
-
-        return [label for _, _, label, _, _ in sentence]
-
-    def _from_json_to_crf(
-        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
-    ) -> List[CRFToken]:
-        """Convert json examples to format of underlying crfsuite."""
-
-        if self.pos_features:
-            from spacy.gold import GoldParse  # pytype: disable=import-error
-
-            doc_or_tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
-            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
-            ents = [l[5] for l in gold.orig_annot]
-        else:
-            doc_or_tokens = self._tokens_without_cls(message)
-            ents = bilou_utils.bilou_tags_from_offsets(doc_or_tokens, entity_offsets)
-
-        # collect badly annotated examples
-        collected = []
-        for t, e in zip(doc_or_tokens, ents):
-            if e == "-":
-                collected.append(t)
-            elif collected:
-                collected_text = " ".join([t.text for t in collected])
-                raise_warning(
-                    f"Misaligned entity annotation for '{collected_text}' "
-                    f"in sentence '{message.text}' with intent "
-                    f"'{message.get('intent')}'. "
-                    f"Make sure the start and end values of the "
-                    f"annotated training examples end at token "
-                    f"boundaries (e.g. don't include trailing "
-                    f"whitespaces or punctuation).",
-                    docs=DOCS_URL_TRAINING_DATA_NLU,
-                )
-                collected = []
-
-        if not self.component_config["BILOU_flag"]:
-            for i, label in enumerate(ents):
-                if bilou_utils.bilou_prefix_from_tag(label) in {"B", "I", "U", "L"}:
-                    # removes BILOU prefix from label
-                    ents[i] = bilou_utils.entity_name_from_tag(label)
-
-        return self._from_text_to_crf(message, ents)
-
-    @staticmethod
-    def __pattern_of_token(message: Message, i: int) -> Dict:
-        if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None:
-            return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get("pattern", {})
-        else:
-            return {}
-
-    @staticmethod
-    def __tag_of_token(token: Any) -> Text:
-        if spacy.about.__version__ > "2" and token._.has("tag"):
-            return token._.get("tag")
-        else:
-            return token.tag_
-
-    @staticmethod
-    def __get_dense_features(message: Message) -> Optional[List[Any]]:
-        features = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
-
-        if features is None:
-            return None
-
-        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])
-        if len(tokens) != len(features):
-            raise_warning(
-                f"Number of features ({len(features)}) for attribute "
-                f"'{DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]}' "
-                f"does not match number of tokens ({len(tokens)}). Set "
-                f"'return_sequence' to true in the corresponding featurizer in order "
-                f"to make use of the features in 'CRFEntityExtractor'.",
-                docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
-            )
-            return None
-
-        # convert to python-crfsuite feature format
-        features_out = []
-        for feature in features:
-            feature_dict = {
-                str(index): token_features
-                for index, token_features in enumerate(feature)
-            }
-            converted = {"text_dense_features": feature_dict}
-            features_out.append(converted)
-        return features_out
-
-    def _from_text_to_crf(
-        self, message: Message, entities: List[Text] = None
-    ) -> List[CRFToken]:
-        """Takes a sentence and switches it to crfsuite format."""
-
-        crf_format = []
-        if self.pos_features:
-            tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
-        else:
-            tokens = self._tokens_without_cls(message)
-
-        text_dense_features = self.__get_dense_features(message)
-
-        for i, token in enumerate(tokens):
-            pattern = self.__pattern_of_token(message, i)
-            entity = entities[i] if entities else "N/A"
-            tag = self.__tag_of_token(token) if self.pos_features else None
-            dense_features = (
-                text_dense_features[i] if text_dense_features is not None else []
-            )
-
-            crf_format.append(
-                CRFToken(token.text, tag, entity, pattern, dense_features)
-            )
-
-        return crf_format
-
-    def _train_model(self, df_train: List[List[CRFToken]]) -> None:
-        """Train the crf tagger based on the training data."""
-        import sklearn_crfsuite
-
-        X_train = [self._sentence_to_features(sent) for sent in df_train]
-        y_train = [self._sentence_to_labels(sent) for sent in df_train]
-        self.ent_tagger = sklearn_crfsuite.CRF(
-            algorithm="lbfgs",
-            # coefficient for L1 penalty
-            c1=self.component_config["L1_c"],
-            # coefficient for L2 penalty
-            c2=self.component_config["L2_c"],
-            # stop earlier
-            max_iterations=self.component_config["max_iterations"],
-            # include transitions that are possible, but not observed
-            all_possible_transitions=True,
+        raise_warning(
+            f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in"
+            f"combination with the 'LexicalSyntacticFeaturizer'. Check "
+            f"Check '{DOCS_BASE_URL}/nlu/components/' for more details.",
+            DeprecationWarning,
         )
-        self.ent_tagger.fit(X_train, y_train)
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 3131a1598da3..256740ccd7bf 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -14,6 +14,7 @@
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
 from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
+from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 from rasa.nlu.extractors.duckling_http_extractor import DucklingHTTPExtractor
 from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
@@ -77,6 +78,7 @@
     MitieIntentClassifier,
     KeywordIntentClassifier,
     DIETClassifier,
+    EmbeddingIntentClassifier,
     # selectors
     ResponseSelector,
 ]
@@ -105,7 +107,7 @@
     "intent_classifier_sklearn": "SklearnIntentClassifier",
     "intent_classifier_mitie": "MitieIntentClassifier",
     "intent_classifier_keyword": "KeywordIntentClassifier",
-    "intent_classifier_tensorflow_embedding": "DIETClassifier",
+    "intent_classifier_tensorflow_embedding": "EmbeddingIntentClassifier",
 }
 
 # To simplify usage, there are a couple of model templates, that already add
@@ -134,12 +136,12 @@
             "min_ngram": 1,
             "max_ngram": 4,
         },
-        {"name": "DIETClassifier"},
+        {"name": "EmbeddingIntentClassifier"},
     ],
     "pretrained_embeddings_convert": [
         {"name": "ConveRTTokenizer"},
         {"name": "ConveRTFeaturizer"},
-        {"name": "DIETClassifier"},
+        {"name": "EmbeddingIntentClassifier"},
     ],
 }
 
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 1b6a04278a17..c201811591be 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, Text
+from typing import Any, Dict, Text, Optional
 
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
@@ -159,27 +159,27 @@ class ResponseSelector(DIETClassifier):
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
-        # if true intent classification is trained and intent predicted
-        INTENT_CLASSIFICATION: True,
-        # if true named entity recognition is trained and entities predicted
-        # (should always be false)
-        ENTITY_RECOGNITION: False,
-        # if true random tokens of the input message will be masked and the model
-        # should predict those tokens
-        MASKED_LM: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
     }
-
     # end default properties (DOC MARKER - don't remove)
 
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
+        super().__init__(component_config)
+
+        # ResponseSelector should not be able to set the following properties
+        self.component_config[INTENT_CLASSIFICATION] = True
+        self.component_config[ENTITY_RECOGNITION] = False
+        self.component_config[MASKED_LM] = False
+
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
             # retrieval intent was left to its default value
             logger.info(
-                "Retrieval intent parameter was left to its default value. This response selector will be trained"
-                "on training examples combining all retrieval intents."
+                "Retrieval intent parameter was left to its default value. This "
+                "response selector will be trained on training examples combining "
+                "all retrieval intents."
             )
 
     def _check_config_parameters(self) -> None:
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index e593b6694d3e..bd0f87f09cb5 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -166,6 +166,10 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
         if self.label_key not in data or len(data[self.label_key]) > 1:
             raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
 
+        # skip balancing if labels are token based
+        if data[self.label_key][0].size > 2:
+            return data
+
         label_ids = self._create_label_ids(data[self.label_key][0])
 
         unique_label_ids, counts_label_ids = np.unique(
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index f1ad5694ae35..7fd885ee5449 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -48,7 +48,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = False,
+        eager: bool = True,
     ) -> None:
         """Fit model data"""
 

From 0a07c61094db84075bf560ee070932db0175798b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 17:26:37 +0100
Subject: [PATCH 263/633] update default values

---
 rasa/nlu/classifiers/diet_classifier.py             | 4 ++--
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 rasa/nlu/extractors/crf_entity_extractor.py         | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 6c6ddab7dd71..260afa7b48ed 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -163,7 +163,7 @@ class DIETClassifier(EntityExtractor):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
+        UNIDIRECTIONAL_ENCODER: True,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -182,7 +182,7 @@ class DIETClassifier(EntityExtractor):
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
-        BILOU_FLAG: True,
+        BILOU_FLAG: False,
     }
     # end default properties (DOC MARKER - don't remove)
 
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 05a0260f5498..9e91604a2acc 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -124,7 +124,7 @@ class EmbeddingIntentClassifier(DIETClassifier):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
+        UNIDIRECTIONAL_ENCODER: True,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 7a0e39ff5d7f..326cd61a3acf 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -76,7 +76,7 @@ class CRFEntityExtractor(DIETClassifier):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: False,
+        SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -85,7 +85,7 @@ class CRFEntityExtractor(DIETClassifier):
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
-        BILOU_FLAG: True,
+        BILOU_FLAG: False,
     }
     # end default properties (DOC MARKER - don't remove)
 

From ab44e66b5b00ec36fe1e92dd3334dd0c2ce6ef6e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 17:31:12 +0100
Subject: [PATCH 264/633] clean up bilou utils

---
 rasa/nlu/classifiers/diet_classifier.py |  3 +--
 rasa/nlu/utils/bilou_utils.py           | 15 ++++-----------
 tests/nlu/utils/test_bilou_utils.py     | 17 -----------------
 3 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 260afa7b48ed..e30eb13a713c 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -343,8 +343,7 @@ def check_input_dimension_consistency(self, model_data: RasaModelData):
 
             if num_text_features != num_intent_features:
                 raise ValueError(
-                    "If embeddings are shared "
-                    "text features and label features "
+                    "If embeddings are shared text features and label features "
                     "must coincide. Check the output dimensions of previous components."
                 )
 
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 019117c06a93..c6f5e87d00b4 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -20,13 +20,6 @@ def entity_name_from_tag(tag: Text) -> Text:
     return tag
 
 
-def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
-    """Get the BILOU prefix (without -) from the given tag."""
-    if len(tag) >= 2 and tag[1] == "-" and tag[:2] in BILOU_PREFIXES:
-        return tag[0].upper()
-    return None
-
-
 def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
     """Maps the entity tags of the message to the ids of the provided dict."""
     if message.get(BILOU_ENTITIES_ATTRIBUTE):
@@ -75,15 +68,15 @@ def apply_bilou_schema(training_data: TrainingData):
         if not entities:
             continue
 
-        entities = map_message_entities(message)
-        output = bilou_tags_from_offsets(
+        entities = _map_message_entities(message)
+        output = _bilou_tags_from_offsets(
             message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), entities
         )
 
         message.set(BILOU_ENTITIES_ATTRIBUTE, output)
 
 
-def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
+def _map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
     """Maps the entities of the given message to their start, end, and tag values."""
 
     def convert_entity(entity):
@@ -92,7 +85,7 @@ def convert_entity(entity):
     return [convert_entity(entity) for entity in message.get(ENTITIES_ATTRIBUTE, [])]
 
 
-def bilou_tags_from_offsets(
+def _bilou_tags_from_offsets(
     tokens: List[Token], entities: List[Tuple[int, int, Text]], missing: Text = "O"
 ) -> List[Text]:
     """Creates a list of BILOU tags for the given list of tokens and entities."""
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index bc2e9c0b9fc8..c65255128d5e 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -23,23 +23,6 @@ def test_entity_name_from_tag(tag, expected):
     assert actual == expected
 
 
-@pytest.mark.parametrize(
-    "tag, expected",
-    [
-        ("B-person", "B"),
-        ("I-location", "I"),
-        ("location", None),
-        ("U-company", "U"),
-        ("L-company", "L"),
-        ("O-company", None),
-    ],
-)
-def test_bilou_from_tag(tag, expected):
-    actual = bilou_utils.bilou_prefix_from_tag(tag)
-
-    assert actual == expected
-
-
 def test_tags_to_ids():
     message = Message("Germany is part of the European Union")
     message.set(

From 6b9f99b296ca3eff3bfcf8ac413c83cfb90f6d47 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 30 Jan 2020 18:08:24 +0100
Subject: [PATCH 265/633] check deprecated options

---
 rasa/core/policies/ted_policy.py              | 36 +++++++++++++++++--
 .../embedding_intent_classifier.py            | 11 +++++-
 rasa/nlu/extractors/crf_entity_extractor.py   |  2 +-
 rasa/utils/tensorflow/constants.py            | 33 +++++++++--------
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 60af66306eae..f4a0beb6578a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -29,7 +29,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    POS_ENCODING,
     MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
@@ -78,8 +77,6 @@ class TEDPolicy(Policy):
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # type of positional encoding in transformer
-        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
         MAX_SEQ_LENGTH: 256,
         # number of attention heads in transformer
@@ -174,6 +171,39 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         if self.config[EVAL_NUM_EPOCHS] < 1:
             self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
 
+        self._check_deprecated_options()
+
+    def _check_deprecated_options(self):
+        if "hidden_layers_sizes_pre_dial" in self.config:
+            logger.warning(
+                f"Option 'hidden_layers_sizes_pre_dial' got renamed to"
+                f" {HIDDEN_LAYERS_SIZES_DIALOGUE}. Please update your configuration "
+                f"file."
+            )
+            self.config[HIDDEN_LAYERS_SIZES_DIALOGUE] = self.config[
+                "hidden_layers_sizes_pre_dial"
+            ]
+        if "hidden_layers_sizes_bot" in self.config:
+            logger.warning(
+                f"Option 'hidden_layers_sizes_bot' got renamed to "
+                f"{HIDDEN_LAYERS_SIZES_LABEL}. Please update your configuration file."
+            )
+            self.config[HIDDEN_LAYERS_SIZES_LABEL] = self.config[
+                "hidden_layers_sizes_bot"
+            ]
+        if "droprate_a" in self.config:
+            logger.warning(
+                f"Option 'droprate_a' got renamed to {DROPRATE_DIALOGUE}. Please "
+                f"update your configuration file."
+            )
+            self.config[DROPRATE_DIALOGUE] = self.config["droprate_a"]
+        if "droprate_b" in self.config:
+            logger.warning(
+                f"Option 'droprate_b' got renamed to {DROPRATE_LABEL}. Please "
+                f"update your configuration file."
+            )
+            self.config[DROPRATE_LABEL] = self.config["droprate_b"]
+
     # data helpers
     # noinspection PyPep8Naming
     @staticmethod
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 9e91604a2acc..22f8334c0a35 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -45,7 +45,7 @@
     MU_POS,
     EMBED_DIM,
 )
-from utils.common import raise_warning
+from rasa.utils.common import raise_warning
 
 logger = logging.getLogger(__name__)
 
@@ -141,6 +141,15 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
 
+        if "hidden_layers_sizes_a" in component_config:
+            component_config[HIDDEN_LAYERS_SIZES_TEXT] = component_config[
+                "hidden_layers_sizes_a"
+            ]
+        if "hidden_layers_sizes_b" in component_config:
+            component_config[HIDDEN_LAYERS_SIZES_LABEL] = component_config[
+                "hidden_layers_sizes_b"
+            ]
+
         super().__init__(component_config)
 
         raise_warning(
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 326cd61a3acf..19e0341fe184 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -34,7 +34,7 @@
     C2,
     BILOU_FLAG,
 )
-from utils.common import raise_warning
+from rasa.utils.common import raise_warning
 
 logger = logging.getLogger(__name__)
 
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index b7b3c68c2a5a..9316f7e716af 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -6,12 +6,11 @@
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
 
 TRANSFORMER_SIZE = "transformer_size"
-NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
-NUM_HEADS = "number_of_attention_heads"
+NUM_TRANSFORMER_LAYERS = "num_transformer_layers"
+NUM_HEADS = "num_heads"
 UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
 
-POS_ENCODING = "positional_encoding"
-MAX_SEQ_LENGTH = "maximum_sequence_length"
+MAX_SEQ_LENGTH = "max_seq_length"
 
 BATCH_SIZES = "batch_sizes"
 BATCH_STRATEGY = "batch_strategy"
@@ -19,31 +18,31 @@
 RANDOM_SEED = "random_seed"
 LEARNING_RATE = "learning_rate"
 
-DENSE_DIM = "dense_dimensions"
-EMBED_DIM = "embedding_dimension"
+DENSE_DIM = "dense_dim"
+EMBED_DIM = "embed_dim"
 
 SIMILARITY_TYPE = "similarity_type"
 LOSS_TYPE = "loss_type"
-NUM_NEG = "number_of_negative_examples"
-MU_POS = "maximum_positive_similarity"
-MU_NEG = "maximum_negative_similarity"
-USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
+NUM_NEG = "num_neg"
+MU_POS = "mu_pos"
+MU_NEG = "mu_neg"
+USE_MAX_SIM_NEG = "use_max_sim_neg"
 
 SCALE_LOSS = "scale_loss"
-C2 = "l2_regularization"
+C2 = "C2"
 C_EMB = "c_emb"
 DROPRATE = "droprate"
 DROPRATE_DIALOGUE = "droprate_dialogue"
 DROPRATE_LABEL = "droprate_label"
 
-EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
-EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
+EVAL_NUM_EPOCHS = "evaluate_every_num_epochs"
+EVAL_NUM_EXAMPLES = "evaluate_on_num_examples"
 
-INTENT_CLASSIFICATION = "perform_intent_classification"
-ENTITY_RECOGNITION = "perform_entity_recognition"
-MASKED_LM = "use_masked_language_model"
+INTENT_CLASSIFICATION = "intent_classification"
+ENTITY_RECOGNITION = "entity_recognition"
+MASKED_LM = "masked_language_model"
 
-SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
+SPARSE_INPUT_DROPOUT = "sparse_input_dropout"
 
 RANKING_LENGTH = "ranking_length"
 

From 55f9228e775c770a452ba679d69ec0c165a2bc28 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 31 Jan 2020 10:05:39 +0100
Subject: [PATCH 266/633] add attention dropout

---
 rasa/core/policies/TED_policy.py        |  3 +-
 rasa/nlu/classifiers/DIET_classifier.py |  5 ++-
 rasa/utils/tensorflow/tf_layers.py      | 54 ++++++++++++++++---------
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/rasa/core/policies/TED_policy.py b/rasa/core/policies/TED_policy.py
index d01137c6ea44..f6baa761b5d7 100644
--- a/rasa/core/policies/TED_policy.py
+++ b/rasa/core/policies/TED_policy.py
@@ -502,7 +502,8 @@ def _prepare_layers(self) -> None:
             self.config[TRANSFORMER_SIZE] * 4,
             self.config[MAX_SEQ_LENGTH],
             self.config[C2],
-            self.config[DROPRATE_DIALOGUE],
+            dropout_rate=self.config[DROPRATE_DIALOGUE],
+            attention_dropout_rate=0,
             unidirectional=True,
             name="dialogue_encoder",
         )
diff --git a/rasa/nlu/classifiers/DIET_classifier.py b/rasa/nlu/classifiers/DIET_classifier.py
index c5acee9857fd..63100705c0a5 100644
--- a/rasa/nlu/classifiers/DIET_classifier.py
+++ b/rasa/nlu/classifiers/DIET_classifier.py
@@ -940,8 +940,9 @@ def _prepare_sequence_layers(self) -> None:
                 self.config[TRANSFORMER_SIZE] * 4,
                 self.config[MAX_SEQ_LENGTH],
                 self.config[C2],
-                self.config[DROPRATE],
-                self.config[UNIDIRECTIONAL_ENCODER],
+                dropout_rate=self.config[DROPRATE],
+                attention_dropout_rate=self.config[DROPRATE],
+                unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
                 name="text_encoder",
             )
             if self.config[NUM_TRANSFORMER_LAYERS] > 0
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 3b81a7076d70..61f9d0eda836 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -82,7 +82,7 @@ class Ffnn(tf.keras.layers.Layer):
     def __init__(
         self,
         layer_sizes: List[int],
-        droprate: float,
+        dropout_rate: float,
         reg_lambda: float,
         layer_name_suffix: Text,
     ) -> None:
@@ -99,7 +99,7 @@ def __init__(
                     name=f"hidden_layer_{layer_name_suffix}_{i}",
                 )
             )
-            self._ffn_layers.append(tf.keras.layers.Dropout(rate=droprate))
+            self._ffn_layers.append(tf.keras.layers.Dropout(dropout_rate))
 
     def call(self, x: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
         for layer in self._ffn_layers:
@@ -146,10 +146,8 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
 # from https://www.tensorflow.org/tutorials/text/transformer
 # and https://github.com/tensorflow/tensor2tensor
 # TODO implement relative attention
-# TODO save attention weights
 class MultiHeadAttention(tf.keras.layers.Layer):
-    @staticmethod
-    def _scaled_dot_product_attention(q, k, v, pad_mask):
+    def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
         """Calculate the attention weights.
         q, k, v must have matching leading dimensions.
         k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
@@ -183,11 +181,17 @@ def _scaled_dot_product_attention(q, k, v, pad_mask):
             logits, axis=-1
         )  # (..., seq_len_q, seq_len_k)
 
+        attention_weights = self._attention_dropout(
+            attention_weights, training=training
+        )
+
         output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
 
         return output, attention_weights
 
-    def __init__(self, d_model: int, num_heads: int) -> None:
+    def __init__(
+        self, d_model: int, num_heads: int, attention_dropout_rate: float
+    ) -> None:
         super().__init__()
 
         self.num_heads = num_heads
@@ -200,6 +204,9 @@ def __init__(self, d_model: int, num_heads: int) -> None:
         self._wq = DenseWithSparseWeights(units=d_model, use_bias=False)
         self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
         self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
+
+        self._attention_dropout = tf.keras.layers.Dropout(attention_dropout_rate)
+
         self._dense = DenseWithSparseWeights(units=d_model)
 
     def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
@@ -234,7 +241,8 @@ def call(
         v: tf.Tensor,
         k: tf.Tensor,
         q: tf.Tensor,
-        pad_mask: Optional[tf.Tensor] = None,
+        pad_mask: Optional[tf.Tensor],
+        training: tf.Tensor,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         q = self._wq(q)  # (batch_size, seq_len_q, d_model)
         k = self._wk(k)  # (batch_size, seq_len_k, d_model)
@@ -245,7 +253,7 @@ def call(
         v = self._split_heads(v)  # (batch_size, num_heads, seq_len_v, depth)
 
         attention, attention_weights = self._scaled_dot_product_attention(
-            q, k, v, pad_mask
+            q, k, v, pad_mask, training
         )
         # attention.shape == (batch_size, num_heads, seq_len_q, depth)
         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
@@ -258,27 +266,32 @@ def call(
 
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(
-        self, d_model: int, num_heads: int, dff: int, rate: float = 0.1
+        self,
+        d_model: int,
+        num_heads: int,
+        dff: int,
+        dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
     ) -> None:
         super().__init__()
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self._mha = MultiHeadAttention(d_model, num_heads)
-        self._dropout = tf.keras.layers.Dropout(rate)
+        self._mha = MultiHeadAttention(d_model, num_heads, attention_dropout_rate)
+        self._dropout = tf.keras.layers.Dropout(dropout_rate)
 
         self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
             DenseWithSparseWeights(
                 units=dff, activation=tfa.activations.gelu
             ),  # (batch_size, seq_len, dff)
-            tf.keras.layers.Dropout(rate),
+            tf.keras.layers.Dropout(dropout_rate),
             DenseWithSparseWeights(units=d_model),  # (batch_size, seq_len, d_model)
-            tf.keras.layers.Dropout(rate),
+            tf.keras.layers.Dropout(dropout_rate),
         ]
 
     def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
         x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
-        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask)
+        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask, training=training)
         attn_out = self._dropout(attn_out, training=training)
         x += attn_out
 
@@ -298,8 +311,8 @@ def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
 
     @staticmethod
     def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
-        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
-        return pos * angle_rates
+        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+        return pos * angle_dropout_rates
 
     @classmethod
     def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
@@ -327,7 +340,8 @@ def __init__(
         dff: int,
         max_seq_length: int,
         reg_lambda: float,
-        rate: float = 0.1,
+        dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
         name: Optional[Text] = None,
     ) -> None:
@@ -343,10 +357,12 @@ def __init__(
 
         self._pos_encoding = self._positional_encoding(max_seq_length, self.d_model)
 
-        self._dropout = tf.keras.layers.Dropout(rate)
+        self._dropout = tf.keras.layers.Dropout(dropout_rate)
 
         self._enc_layers = [
-            TransformerEncoderLayer(d_model, num_heads, dff, rate)
+            TransformerEncoderLayer(
+                d_model, num_heads, dff, dropout_rate, attention_dropout_rate
+            )
             for _ in range(num_layers)
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

From 729ac40de2a8d7ee0440b0057cb995f70c6a564a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 31 Jan 2020 10:43:57 +0100
Subject: [PATCH 267/633] put transformer into separate file

---
 rasa/core/policies/ted_policy.py              |  19 +-
 rasa/nlu/classifiers/diet_classifier.py       |  37 +--
 .../selectors/embedding_response_selector.py  |   2 +-
 .../tensorflow/{tf_layers.py => layers.py}    | 250 ------------------
 .../{tf_model_data.py => model_data.py}       |   0
 .../tensorflow/{tf_models.py => models.py}    |   2 +-
 tests/utils/test_tf_model_data.py             |   2 +-
 7 files changed, 32 insertions(+), 280 deletions(-)
 rename rasa/utils/tensorflow/{tf_layers.py => layers.py} (68%)
 rename rasa/utils/tensorflow/{tf_model_data.py => model_data.py} (100%)
 rename rasa/utils/tensorflow/{tf_models.py => models.py} (99%)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index a166d570ef03..dc6a3a63780e 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -21,9 +21,10 @@
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils import train_utils
-from rasa.utils.tensorflow import tf_layers
-from rasa.utils.tensorflow.tf_models import RasaModel
-from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow import layers
+from rasa.utils.tensorflow.transformer import TransformerEncoder
+from rasa.utils.tensorflow.models import RasaModel
+from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_LABEL,
     TRANSFORMER_SIZE,
@@ -472,7 +473,7 @@ def __init__(
         self._prepare_layers()
 
     def _prepare_layers(self) -> None:
-        self._tf_layers["loss.label"] = tf_layers.DotProductLoss(
+        self._tf_layers["loss.label"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -483,19 +484,19 @@ def _prepare_layers(self) -> None:
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
         )
-        self._tf_layers["ffnn.dialogue"] = tf_layers.Ffnn(
+        self._tf_layers["ffnn.dialogue"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
             self.config[C2],
             layer_name_suffix="dialogue",
         )
-        self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
+        self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE_LABEL],
             self.config[C2],
             layer_name_suffix="label",
         )
-        self._tf_layers["transformer"] = tf_layers.TransformerEncoder(
+        self._tf_layers["transformer"] = TransformerEncoder(
             self.config[NUM_TRANSFORMER_LAYERS],
             self.config[TRANSFORMER_SIZE],
             self.config[NUM_HEADS],
@@ -507,13 +508,13 @@ def _prepare_layers(self) -> None:
             unidirectional=True,
             name="dialogue_encoder",
         )
-        self._tf_layers["embed.dialogue"] = tf_layers.Embed(
+        self._tf_layers["embed.dialogue"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "dialogue",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.label"] = tf_layers.Embed(
+        self._tf_layers["embed.label"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "label",
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 1602cbca2678..21423b1c5346 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -18,9 +18,10 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import any_of
 from rasa.utils import train_utils
-from rasa.utils.tensorflow import tf_layers
-from rasa.utils.tensorflow.tf_models import RasaModel
-from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow import layers
+from rasa.utils.tensorflow.transformer import TransformerEncoder
+from rasa.utils.tensorflow.models import RasaModel
+from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature
 from rasa.nlu.constants import (
     INTENT_ATTRIBUTE,
     TEXT_ATTRIBUTE,
@@ -905,7 +906,7 @@ def _create_sparse_dense_layer(
         name: Text,
         reg_lambda: float,
         dense_dim: int,
-    ) -> Optional[tf_layers.DenseForSparse]:
+    ) -> Optional[layers.DenseForSparse]:
 
         sparse = False
         for is_sparse, shape in feature_signatures:
@@ -917,12 +918,12 @@ def _create_sparse_dense_layer(
                 dense_dim = shape[-1]
 
         if sparse:
-            return tf_layers.DenseForSparse(
+            return layers.DenseForSparse(
                 units=dense_dim, reg_lambda=reg_lambda, name=name
             )
 
     def _prepare_sequence_layers(self) -> None:
-        self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
+        self._tf_layers["sparse_dropout"] = layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
         self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
@@ -937,20 +938,20 @@ def _prepare_sequence_layers(self) -> None:
             self.config[C2],
             self.config[DENSE_DIM]["label"],
         )
-        self._tf_layers["ffnn.text"] = tf_layers.Ffnn(
+        self._tf_layers["ffnn.text"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_TEXT],
             self.config[DROPRATE],
             self.config[C2],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
         )
-        self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
+        self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE],
             self.config[C2],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
         )
         self._tf_layers["transformer"] = (
-            tf_layers.TransformerEncoder(
+            TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
                 self.config[TRANSFORMER_SIZE],
                 self.config[NUM_HEADS],
@@ -967,20 +968,20 @@ def _prepare_sequence_layers(self) -> None:
         )
 
     def _prepare_mask_lm_layers(self) -> None:
-        self._tf_layers["input_mask"] = tf_layers.InputMask()
-        self._tf_layers["embed.lm_mask"] = tf_layers.Embed(
+        self._tf_layers["input_mask"] = layers.InputMask()
+        self._tf_layers["embed.lm_mask"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "lm_mask",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.golden_token"] = tf_layers.Embed(
+        self._tf_layers["embed.golden_token"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "golden_token",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["loss.mask"] = tf_layers.DotProductLoss(
+        self._tf_layers["loss.mask"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -993,19 +994,19 @@ def _prepare_mask_lm_layers(self) -> None:
         )
 
     def _prepare_intent_classification_layers(self) -> None:
-        self._tf_layers["embed.text"] = tf_layers.Embed(
+        self._tf_layers["embed.text"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "text",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.label"] = tf_layers.Embed(
+        self._tf_layers["embed.label"] = layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
             "label",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["loss.label"] = tf_layers.DotProductLoss(
+        self._tf_layers["loss.label"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -1018,10 +1019,10 @@ def _prepare_intent_classification_layers(self) -> None:
         )
 
     def _prepare_entity_recognition_layers(self) -> None:
-        self._tf_layers["embed.logits"] = tf_layers.Embed(
+        self._tf_layers["embed.logits"] = layers.Embed(
             self._num_tags, self.config[C2], "logits"
         )
-        self._tf_layers["crf"] = tf_layers.CRF(self._num_tags, self.config[C2])
+        self._tf_layers["crf"] = layers.CRF(self._num_tags, self.config[C2])
         self._tf_layers["crf_f1_score"] = tfa.metrics.F1Score(
             num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
             average="micro",
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 1b6a04278a17..40c073966466 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -47,7 +47,7 @@
     TEXT_ATTRIBUTE,
     SPARSE_FEATURE_NAMES,
 )
-from rasa.utils.tensorflow.tf_model_data import RasaModelData
+from rasa.utils.tensorflow.model_data import RasaModelData
 
 
 logger = logging.getLogger(__name__)
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/layers.py
similarity index 68%
rename from rasa/utils/tensorflow/tf_layers.py
rename to rasa/utils/tensorflow/layers.py
index 61f9d0eda836..4c2cf593c2c1 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -3,7 +3,6 @@
 import tensorflow as tf
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
-import numpy as np
 
 logger = logging.getLogger(__name__)
 
@@ -143,255 +142,6 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
         return x
 
 
-# from https://www.tensorflow.org/tutorials/text/transformer
-# and https://github.com/tensorflow/tensor2tensor
-# TODO implement relative attention
-class MultiHeadAttention(tf.keras.layers.Layer):
-    def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
-        """Calculate the attention weights.
-        q, k, v must have matching leading dimensions.
-        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
-        The mask has different shapes depending on its type(padding or look ahead)
-        but it must be broadcastable for addition.
-
-        Args:
-          q: query shape == (..., seq_len_q, depth)
-          k: key shape == (..., seq_len_k, depth)
-          v: value shape == (..., seq_len_v, depth_v)
-          pad_mask: Float tensor with shape broadcastable
-                to (..., seq_len_q, seq_len_k). Defaults to None.
-
-        Returns:
-          output, attention_weights
-        """
-
-        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
-
-        # scale matmul_qk
-        dk = tf.cast(tf.shape(k)[-1], tf.float32)
-        logits = matmul_qk / tf.math.sqrt(dk)
-
-        # add the mask to the scaled tensor.
-        if pad_mask is not None:
-            logits += pad_mask * -1e9
-
-        # softmax is normalized on the last axis (seq_len_k) so that the scores
-        # add up to 1.
-        attention_weights = tf.nn.softmax(
-            logits, axis=-1
-        )  # (..., seq_len_q, seq_len_k)
-
-        attention_weights = self._attention_dropout(
-            attention_weights, training=training
-        )
-
-        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
-
-        return output, attention_weights
-
-    def __init__(
-        self, d_model: int, num_heads: int, attention_dropout_rate: float
-    ) -> None:
-        super().__init__()
-
-        self.num_heads = num_heads
-        self.d_model = d_model
-
-        assert d_model % self.num_heads == 0
-
-        self._depth = d_model // self.num_heads
-
-        self._wq = DenseWithSparseWeights(units=d_model, use_bias=False)
-        self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
-        self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
-
-        self._attention_dropout = tf.keras.layers.Dropout(attention_dropout_rate)
-
-        self._dense = DenseWithSparseWeights(units=d_model)
-
-    def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
-        """Split the last dimension into (num_heads, depth).
-
-        Transpose the result such that the shape is
-        (batch_size, num_heads, seq_len, depth)
-        """
-
-        x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth))
-        return tf.transpose(x, perm=[0, 2, 1, 3])
-
-    def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
-        """Inverse of split_heads.
-
-        Args:
-          x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
-
-        Returns:
-          a Tensor with shape [batch, length, channels]
-        """
-
-        x = tf.transpose(
-            x, perm=[0, 2, 1, 3]
-        )  # (batch_size, seq_len_q, num_heads, depth)
-        return tf.reshape(
-            x, (tf.shape(x)[0], -1, self.d_model)
-        )  # (batch_size, seq_len_q, d_model)
-
-    def call(
-        self,
-        v: tf.Tensor,
-        k: tf.Tensor,
-        q: tf.Tensor,
-        pad_mask: Optional[tf.Tensor],
-        training: tf.Tensor,
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
-        q = self._wq(q)  # (batch_size, seq_len_q, d_model)
-        k = self._wk(k)  # (batch_size, seq_len_k, d_model)
-        v = self._wv(v)  # (batch_size, seq_len_v, d_model)
-
-        q = self._split_heads(q)  # (batch_size, num_heads, seq_len_q, depth)
-        k = self._split_heads(k)  # (batch_size, num_heads, seq_len_k, depth)
-        v = self._split_heads(v)  # (batch_size, num_heads, seq_len_v, depth)
-
-        attention, attention_weights = self._scaled_dot_product_attention(
-            q, k, v, pad_mask, training
-        )
-        # attention.shape == (batch_size, num_heads, seq_len_q, depth)
-        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
-        attention = self._combine_heads(attention)  # (batch_size, seq_len_q, d_model)
-
-        output = self._dense(attention)  # (batch_size, seq_len_q, d_model)
-
-        return output, attention_weights
-
-
-class TransformerEncoderLayer(tf.keras.layers.Layer):
-    def __init__(
-        self,
-        d_model: int,
-        num_heads: int,
-        dff: int,
-        dropout_rate: float = 0.1,
-        attention_dropout_rate: float = 0.0,
-    ) -> None:
-        super().__init__()
-
-        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-        self._mha = MultiHeadAttention(d_model, num_heads, attention_dropout_rate)
-        self._dropout = tf.keras.layers.Dropout(dropout_rate)
-
-        self._ffn_layers = [
-            tf.keras.layers.LayerNormalization(epsilon=1e-6),
-            DenseWithSparseWeights(
-                units=dff, activation=tfa.activations.gelu
-            ),  # (batch_size, seq_len, dff)
-            tf.keras.layers.Dropout(dropout_rate),
-            DenseWithSparseWeights(units=d_model),  # (batch_size, seq_len, d_model)
-            tf.keras.layers.Dropout(dropout_rate),
-        ]
-
-    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
-        x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
-        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask, training=training)
-        attn_out = self._dropout(attn_out, training=training)
-        x += attn_out
-
-        ffn_out = x  # (batch_size, seq_len, d_model)
-        for layer in self._ffn_layers:
-            ffn_out = layer(ffn_out, training=training)
-        x += ffn_out
-
-        return x  # (batch_size, seq_len, d_model)
-
-
-class TransformerEncoder(tf.keras.layers.Layer):
-    @staticmethod
-    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
-        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
-
-    @staticmethod
-    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
-        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
-        return pos * angle_dropout_rates
-
-    @classmethod
-    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
-        angle_rads = cls._get_angles(
-            np.arange(max_position)[:, np.newaxis],
-            np.arange(d_model)[np.newaxis, :],
-            d_model,
-        )
-
-        # apply sin to even indices in the array; 2i
-        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
-
-        # apply cos to odd indices in the array; 2i+1
-        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
-
-        pos_encoding = angle_rads[np.newaxis, ...]
-
-        return tf.cast(pos_encoding, dtype=tf.float32)
-
-    def __init__(
-        self,
-        num_layers: int,
-        d_model: int,
-        num_heads: int,
-        dff: int,
-        max_seq_length: int,
-        reg_lambda: float,
-        dropout_rate: float = 0.1,
-        attention_dropout_rate: float = 0.0,
-        unidirectional: bool = False,
-        name: Optional[Text] = None,
-    ) -> None:
-        super().__init__(name=name)
-
-        self.d_model = d_model
-        self.unidirectional = unidirectional
-
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
-        self._embedding = DenseWithSparseWeights(
-            units=d_model, kernel_regularizer=l2_regularizer
-        )
-
-        self._pos_encoding = self._positional_encoding(max_seq_length, self.d_model)
-
-        self._dropout = tf.keras.layers.Dropout(dropout_rate)
-
-        self._enc_layers = [
-            TransformerEncoderLayer(
-                d_model, num_heads, dff, dropout_rate, attention_dropout_rate
-            )
-            for _ in range(num_layers)
-        ]
-        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-
-    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
-
-        # adding embedding and position encoding.
-        x = self._embedding(x)  # (batch_size, seq_len, d_model)
-        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
-        x = self._dropout(x, training=training)
-
-        pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
-        pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
-        if self.unidirectional:
-            # add look ahead pad mask to emulate unidirectional behavior
-            pad_mask = tf.minimum(
-                1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
-            )  # (batch_size, 1, seq_len, seq_len)
-
-        for layer in self._enc_layers:
-            x = layer(x, pad_mask, training)  # (batch_size, seq_len, d_model)
-
-        # if normalization is done in encoding layers, then it should also be done
-        # on the output, since the output can grow very large, being the sum of
-        # a whole stack of unnormalized layer outputs.
-        return self._layernorm(x)  # (batch_size, seq_len, d_model)
-
-
 class InputMask(tf.keras.layers.Layer):
     def build(self, input_shape: tf.TensorShape) -> None:
         initializer = tf.keras.initializers.GlorotUniform()
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/model_data.py
similarity index 100%
rename from rasa/utils/tensorflow/tf_model_data.py
rename to rasa/utils/tensorflow/model_data.py
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/models.py
similarity index 99%
rename from rasa/utils/tensorflow/tf_models.py
rename to rasa/utils/tensorflow/models.py
index f1ad5694ae35..acde5e944bda 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -5,7 +5,7 @@
 from typing import List, Text, Dict, Tuple, Union, Optional, Callable
 from tqdm import tqdm
 from rasa.utils.common import is_logging_disabled
-from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index 17f58ed787dc..b1560d9f72f1 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -4,7 +4,7 @@
 import scipy.sparse
 import numpy as np
 
-from rasa.utils.tensorflow.tf_model_data import RasaModelData
+from rasa.utils.tensorflow.model_data import RasaModelData
 
 
 @pytest.fixture

From 71077118879805c4f8dafbd74a7b00161a7cf738 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 31 Jan 2020 11:24:14 +0100
Subject: [PATCH 268/633] add transformer file

---
 rasa/utils/tensorflow/transformer.py | 315 +++++++++++++++++++++++++++
 1 file changed, 315 insertions(+)
 create mode 100644 rasa/utils/tensorflow/transformer.py

diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
new file mode 100644
index 000000000000..9c5ce3960da0
--- /dev/null
+++ b/rasa/utils/tensorflow/transformer.py
@@ -0,0 +1,315 @@
+from typing import List, Optional, Text, Tuple, Callable
+import tensorflow as tf
+import tensorflow_addons as tfa
+import numpy as np
+from rasa.utils.tensorflow.layers import DenseWithSparseWeights
+
+
+# from https://www.tensorflow.org/tutorials/text/transformer
+# and https://github.com/tensorflow/tensor2tensor
+class MultiHeadAttention(tf.keras.layers.Layer):
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        attention_dropout_rate: float = 0.0,
+        unidirectional: bool = False,
+        use_relative_position: bool = False,
+        max_relative_position: Optional[int] = None,
+        heads_share_relative_embedding: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.d_model = d_model
+        self.unidirectional = unidirectional
+        self.use_relative_position = use_relative_position
+        self.max_relative_position = max_relative_position
+
+        assert d_model % self.num_heads == 0
+
+        self._depth = d_model // self.num_heads
+
+        self._wq = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
+
+        if use_relative_position:
+            if not max_relative_position:
+                raise ValueError(
+                    f"Max relative position {max_relative_position} "
+                    f"should be > 0 when using relative attention."
+                )
+
+            if unidirectional:
+                max_relative_position_unmasked = max_relative_position
+            else:
+                max_relative_position_unmasked = 2 * max_relative_position - 1
+
+            if heads_share_relative_embedding:
+                relative_embedding_shape = (max_relative_position_unmasked, self._depth)
+            else:
+                relative_embedding_shape = (
+                    num_heads,
+                    max_relative_position_unmasked,
+                    self._depth,
+                )
+
+            initializer = tf.keras.initializers.TruncatedNormal(
+                stddev=self._depth ** -0.5
+            )
+            self.key_relative_embeddings = self.add_weight(
+                shape=relative_embedding_shape,
+                initializer=initializer,
+                trainable=True,
+                name="key_relative_embeddings",
+            )
+            self.value_relative_embeddings = self.add_weight(
+                shape=relative_embedding_shape,
+                initializer=initializer,
+                trainable=True,
+                name="value_relative_embeddings",
+            )
+        else:
+            self.key_relative_embeddings = None
+            self.value_relative_embeddings = None
+
+        self._attention_dropout = tf.keras.layers.Dropout(attention_dropout_rate)
+
+        self._dense = DenseWithSparseWeights(units=d_model)
+
+    def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
+        """Calculate the attention weights.
+        q, k, v must have matching leading dimensions.
+        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
+        The mask has different shapes depending on its type(padding or look ahead)
+        but it must be broadcastable for addition.
+
+        Args:
+          q: query shape == (..., seq_len_q, depth)
+          k: key shape == (..., seq_len_k, depth)
+          v: value shape == (..., seq_len_v, depth_v)
+          pad_mask: Float tensor with shape broadcastable
+                to (..., seq_len_q, seq_len_k). Defaults to None.
+
+        Returns:
+          output, attention_weights
+        """
+
+        matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
+
+        # TODO add key relative embeddings
+
+        # scale matmul_qk
+        dk = tf.cast(tf.shape(k)[-1], tf.float32)
+        logits = matmul_qk / tf.math.sqrt(dk)
+
+        # add the mask to the scaled tensor.
+        if pad_mask is not None:
+            logits += pad_mask * -1e9
+
+        # softmax is normalized on the last axis (seq_len_k) so that the scores
+        # add up to 1.
+        attention_weights = tf.nn.softmax(
+            logits, axis=-1
+        )  # (..., seq_len_q, seq_len_k)
+
+        attention_weights = self._attention_dropout(
+            attention_weights, training=training
+        )
+
+        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
+
+        # TODO add value relative embedding to values
+
+        return output, attention_weights
+
+    def _split_heads(self, x: tf.Tensor) -> tf.Tensor:
+        """Split the last dimension into (num_heads, depth).
+
+        Transpose the result such that the shape is
+        (batch_size, num_heads, seq_len, depth)
+        """
+
+        x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+
+    def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
+        """Inverse of split_heads.
+
+        Args:
+          x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
+
+        Returns:
+          a Tensor with shape [batch, length, channels]
+        """
+
+        x = tf.transpose(
+            x, perm=[0, 2, 1, 3]
+        )  # (batch_size, seq_len_q, num_heads, depth)
+        return tf.reshape(
+            x, (tf.shape(x)[0], -1, self.d_model)
+        )  # (batch_size, seq_len_q, d_model)
+
+    def call(
+        self,
+        v: tf.Tensor,
+        k: tf.Tensor,
+        q: tf.Tensor,
+        pad_mask: Optional[tf.Tensor],
+        training: tf.Tensor,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
+        q = self._wq(q)  # (batch_size, seq_len_q, d_model)
+        k = self._wk(k)  # (batch_size, seq_len_k, d_model)
+        v = self._wv(v)  # (batch_size, seq_len_v, d_model)
+
+        q = self._split_heads(q)  # (batch_size, num_heads, seq_len_q, depth)
+        k = self._split_heads(k)  # (batch_size, num_heads, seq_len_k, depth)
+        v = self._split_heads(v)  # (batch_size, num_heads, seq_len_v, depth)
+
+        attention, attention_weights = self._scaled_dot_product_attention(
+            q, k, v, pad_mask, training
+        )
+        # attention.shape == (batch_size, num_heads, seq_len_q, depth)
+        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
+        attention = self._combine_heads(attention)  # (batch_size, seq_len_q, d_model)
+
+        output = self._dense(attention)  # (batch_size, seq_len_q, d_model)
+
+        return output, attention_weights
+
+
+class TransformerEncoderLayer(tf.keras.layers.Layer):
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        dff: int,
+        dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        unidirectional: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self._mha = MultiHeadAttention(
+            d_model, num_heads, attention_dropout_rate, unidirectional
+        )
+        self._dropout = tf.keras.layers.Dropout(dropout_rate)
+
+        self._ffn_layers = [
+            tf.keras.layers.LayerNormalization(epsilon=1e-6),
+            DenseWithSparseWeights(
+                units=dff, activation=tfa.activations.gelu
+            ),  # (batch_size, seq_len, dff)
+            tf.keras.layers.Dropout(dropout_rate),
+            DenseWithSparseWeights(units=d_model),  # (batch_size, seq_len, d_model)
+            tf.keras.layers.Dropout(dropout_rate),
+        ]
+
+    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+        x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
+        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask, training=training)
+        attn_out = self._dropout(attn_out, training=training)
+        x += attn_out
+
+        ffn_out = x  # (batch_size, seq_len, d_model)
+        for layer in self._ffn_layers:
+            ffn_out = layer(ffn_out, training=training)
+        x += ffn_out
+
+        return x  # (batch_size, seq_len, d_model)
+
+
+class TransformerEncoder(tf.keras.layers.Layer):
+    @staticmethod
+    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
+        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
+        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
+
+    @staticmethod
+    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
+        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+        return pos * angle_dropout_rates
+
+    @classmethod
+    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
+        angle_rads = cls._get_angles(
+            np.arange(max_position)[:, np.newaxis],
+            np.arange(d_model)[np.newaxis, :],
+            d_model,
+        )
+
+        # apply sin to even indices in the array; 2i
+        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+        # apply cos to odd indices in the array; 2i+1
+        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+        pos_encoding = angle_rads[np.newaxis, ...]
+
+        return tf.cast(pos_encoding, dtype=tf.float32)
+
+    def __init__(
+        self,
+        num_layers: int,
+        d_model: int,
+        num_heads: int,
+        dff: int,
+        max_seq_length: int,
+        reg_lambda: float,
+        dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        unidirectional: bool = False,
+        name: Optional[Text] = None,
+    ) -> None:
+        super().__init__(name=name)
+
+        self.d_model = d_model
+        self.unidirectional = unidirectional
+
+        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self._embedding = DenseWithSparseWeights(
+            units=d_model, kernel_regularizer=l2_regularizer
+        )
+
+        self._pos_encoding = self._positional_encoding(max_seq_length, self.d_model)
+
+        self._dropout = tf.keras.layers.Dropout(dropout_rate)
+
+        self._enc_layers = [
+            TransformerEncoderLayer(
+                d_model,
+                num_heads,
+                dff,
+                dropout_rate,
+                attention_dropout_rate,
+                unidirectional,
+            )
+            for _ in range(num_layers)
+        ]
+        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+
+        # adding embedding and position encoding.
+        x = self._embedding(x)  # (batch_size, seq_len, d_model)
+        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
+        x = self._dropout(x, training=training)
+
+        pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
+        pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
+        if self.unidirectional:
+            # add look ahead pad mask to emulate unidirectional behavior
+            pad_mask = tf.minimum(
+                1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
+            )  # (batch_size, 1, seq_len, seq_len)
+
+        for layer in self._enc_layers:
+            x = layer(x, pad_mask, training)  # (batch_size, seq_len, d_model)
+
+        # if normalization is done in encoding layers, then it should also be done
+        # on the output, since the output can grow very large, being the sum of
+        # a whole stack of unnormalized layer outputs.
+        return self._layernorm(x)  # (batch_size, seq_len, d_model)

From 2f14324c218c2598b2a67d73c1f10a41ec404c93 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 31 Jan 2020 19:34:52 +0100
Subject: [PATCH 269/633] fix imports and docs

---
 docs/core/policies.rst              | 2 +-
 docs/nlu/components.rst             | 2 +-
 tests/nlu/utils/test_bilou_utils.py | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 4d44423bbc13..ba06a147e62d 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -345,7 +345,7 @@ It is recommended to use
     These parameters can be specified in the policy configuration file.
     The default values are defined in ``EmbeddingPolicy.defaults``:
 
-    .. literalinclude:: ../../rasa/core/policies/TED_policy.py
+    .. literalinclude:: ../../rasa/core/policies/ted_policy.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index cfc81e29fd9f..c5f6c2a8a060 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -578,7 +578,7 @@ DIETClassifier
     In the config, you can specify these parameters.
     The default values are defined in ``DIETClassifier.defaults``:
 
-    .. literalinclude:: ../../rasa/nlu/classifiers/DIET_classifier.py
+    .. literalinclude:: ../../rasa/nlu/classifiers/diet_classifier.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index bc2e9c0b9fc8..292a21bda0b9 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -1,10 +1,9 @@
 import pytest
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
-from nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
-from nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-from nlu.training_data import TrainingData
-from rasa.nlu.training_data import Message
+from rasa.nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.training_data import TrainingData, Message
 
 
 @pytest.mark.parametrize(

From dbef5f7c97926e37c4a8ac71b6075d4b8dc74ca9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 11:29:59 +0100
Subject: [PATCH 270/633] ensure training works

---
 .../classifiers/embedding_intent_classifier.py    |  8 +++++---
 rasa/nlu/extractors/crf_entity_extractor.py       |  6 +++---
 rasa/nlu/selectors/embedding_response_selector.py | 15 +++++++--------
 rasa/utils/tensorflow/tf_models.py                |  2 +-
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 22f8334c0a35..b050d669d3b3 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -44,6 +44,7 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
+    BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
 
@@ -140,6 +141,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
+        component_config[BILOU_FLAG] = False
 
         if "hidden_layers_sizes_a" in component_config:
             component_config[HIDDEN_LAYERS_SIZES_TEXT] = component_config[
@@ -153,7 +155,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)
 
         raise_warning(
-            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead ."
-            f" Check '{DOCS_BASE_URL}/nlu/components/' for more details.",
-            DeprecationWarning,
+            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead .",
+            category=DeprecationWarning,
+            docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 19e0341fe184..8157f829089a 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -105,7 +105,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
 
         raise_warning(
             f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in"
-            f"combination with the 'LexicalSyntacticFeaturizer'. Check "
-            f"Check '{DOCS_BASE_URL}/nlu/components/' for more details.",
-            DeprecationWarning,
+            f"combination with the 'LexicalSyntacticFeaturizer'.",
+            category=DeprecationWarning,
+            docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index c201811591be..f0f87c94bb1e 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -11,7 +11,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    POS_ENCODING,
     MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
@@ -38,6 +37,7 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
+    BILOU_FLAG,
 )
 from rasa.nlu.constants import (
     RESPONSE_ATTRIBUTE,
@@ -101,8 +101,6 @@ class ResponseSelector(DIETClassifier):
         NUM_TRANSFORMER_LAYERS: 1,
         # number of attention heads in transformer
         NUM_HEADS: 4,
-        # type of positional encoding in transformer
-        POS_ENCODING: "timing",  # string 'timing' or 'emb'
         # max sequence length if pos_encoding='emb'
         MAX_SEQ_LENGTH: 256,
         # training parameters
@@ -165,12 +163,13 @@ class ResponseSelector(DIETClassifier):
     # end default properties (DOC MARKER - don't remove)
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
-        super().__init__(component_config)
-
         # ResponseSelector should not be able to set the following properties
-        self.component_config[INTENT_CLASSIFICATION] = True
-        self.component_config[ENTITY_RECOGNITION] = False
-        self.component_config[MASKED_LM] = False
+        component_config[INTENT_CLASSIFICATION] = True
+        component_config[ENTITY_RECOGNITION] = False
+        component_config[MASKED_LM] = False
+        component_config[BILOU_FLAG] = False
+
+        super().__init__(component_config)
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index 7fd885ee5449..f1ad5694ae35 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -48,7 +48,7 @@ def fit(
         evaluate_every_num_epochs: int,
         batch_strategy: Text,
         silent: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Fit model data"""
 

From 21d1e7e1f401d8c7e3ef1b4712e96e2dc40cf4b8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 14:02:31 +0100
Subject: [PATCH 271/633] update model option names

---
 rasa/nlu/classifiers/diet_classifier.py       |  4 +
 .../embedding_intent_classifier.py            | 10 ---
 rasa/nlu/extractors/crf_entity_extractor.py   |  1 -
 rasa/utils/tensorflow/constants.py            | 28 +++----
 rasa/utils/train_utils.py                     | 74 ++++++++++++++++++-
 5 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index e30eb13a713c..d50d911e2a9d 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -188,6 +188,10 @@ class DIETClassifier(EntityExtractor):
 
     # init helpers
     def _check_config_parameters(self) -> None:
+        self.component_config = train_utils.check_deprecated_options(
+            self.component_config
+        )
+
         if self.component_config[INTENT_CLASSIFICATION]:
             if (
                 self.component_config[SHARE_HIDDEN_LAYERS]
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b050d669d3b3..7c83196d9b0a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -137,21 +137,11 @@ class EmbeddingIntentClassifier(DIETClassifier):
     # end default properties (DOC MARKER - don't remove)
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
-
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
 
-        if "hidden_layers_sizes_a" in component_config:
-            component_config[HIDDEN_LAYERS_SIZES_TEXT] = component_config[
-                "hidden_layers_sizes_a"
-            ]
-        if "hidden_layers_sizes_b" in component_config:
-            component_config[HIDDEN_LAYERS_SIZES_LABEL] = component_config[
-                "hidden_layers_sizes_b"
-            ]
-
         super().__init__(component_config)
 
         raise_warning(
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 8157f829089a..a85fcb7f5306 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -90,7 +90,6 @@ class CRFEntityExtractor(DIETClassifier):
     # end default properties (DOC MARKER - don't remove)
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
-
         component_config[INTENT_CLASSIFICATION] = False
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 9316f7e716af..bbc959acca92 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -6,11 +6,11 @@
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
 
 TRANSFORMER_SIZE = "transformer_size"
-NUM_TRANSFORMER_LAYERS = "num_transformer_layers"
-NUM_HEADS = "num_heads"
+NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
+NUM_HEADS = "number_of_attention_heads"
 UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
 
-MAX_SEQ_LENGTH = "max_seq_length"
+MAX_SEQ_LENGTH = "maximum_sequence_length"
 
 BATCH_SIZES = "batch_sizes"
 BATCH_STRATEGY = "batch_strategy"
@@ -18,31 +18,31 @@
 RANDOM_SEED = "random_seed"
 LEARNING_RATE = "learning_rate"
 
-DENSE_DIM = "dense_dim"
-EMBED_DIM = "embed_dim"
+DENSE_DIM = "dense_dimension"
+EMBED_DIM = "embedding_dimension"
 
 SIMILARITY_TYPE = "similarity_type"
 LOSS_TYPE = "loss_type"
-NUM_NEG = "num_neg"
-MU_POS = "mu_pos"
-MU_NEG = "mu_neg"
-USE_MAX_SIM_NEG = "use_max_sim_neg"
+NUM_NEG = "number_of_negative_examples"
+MU_POS = "maximum_positive_similarity"
+MU_NEG = "maximum_negative_similarity"
+USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
 
 SCALE_LOSS = "scale_loss"
-C2 = "C2"
+C2 = "l2_regularization"
 C_EMB = "c_emb"
 DROPRATE = "droprate"
 DROPRATE_DIALOGUE = "droprate_dialogue"
 DROPRATE_LABEL = "droprate_label"
 
-EVAL_NUM_EPOCHS = "evaluate_every_num_epochs"
-EVAL_NUM_EXAMPLES = "evaluate_on_num_examples"
+EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
+EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
 
 INTENT_CLASSIFICATION = "intent_classification"
 ENTITY_RECOGNITION = "entity_recognition"
-MASKED_LM = "masked_language_model"
+MASKED_LM = "use_masked_language_model"
 
-SPARSE_INPUT_DROPOUT = "sparse_input_dropout"
+SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout"
 
 RANKING_LENGTH = "ranking_length"
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 17ebcb124ab8..46940ade1be9 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -3,7 +3,28 @@
 import numpy as np
 from typing import Optional, Text, Dict, Any
 
-from rasa.utils.tensorflow.constants import SIMILARITY_TYPE, LOSS_TYPE
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_TEXT,
+    HIDDEN_LAYERS_SIZES_LABEL,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
+    DENSE_DIM,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    C2,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+    HIDDEN_LAYERS_SIZES_DIALOGUE,
+    DROPRATE_DIALOGUE,
+    DROPRATE_LABEL,
+)
+
 
 logger = logging.getLogger(__name__)
 
@@ -41,3 +62,54 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
             config[SIMILARITY_TYPE] = "cosine"
 
     return config
+
+
+def _replace_deprecated_option(
+    old_option: Text, new_option: Text, config: Dict[Text, Any]
+) -> Dict[Text, Any]:
+    if old_option in config:
+        logger.warning(
+            f"Option '{old_option}' got renamed to {new_option}. "
+            f"Please update your configuration file."
+        )
+        config[new_option] = config[old_option]
+
+    return config
+
+
+def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
+
+    config = _replace_deprecated_option(
+        "hidden_layers_sizes_pre_dial", HIDDEN_LAYERS_SIZES_DIALOGUE, config
+    )
+    config = _replace_deprecated_option(
+        "hidden_layers_sizes_bot", HIDDEN_LAYERS_SIZES_LABEL, config
+    )
+    config = _replace_deprecated_option("droprate_a", DROPRATE_DIALOGUE, config)
+    config = _replace_deprecated_option("droprate_b", DROPRATE_LABEL, config)
+    config = _replace_deprecated_option(
+        "hidden_layers_sizes_a", HIDDEN_LAYERS_SIZES_TEXT, config
+    )
+    config = _replace_deprecated_option(
+        "hidden_layers_sizes_b", HIDDEN_LAYERS_SIZES_LABEL, config
+    )
+    config = _replace_deprecated_option(
+        "num_transformer_layers", NUM_TRANSFORMER_LAYERS, config
+    )
+    config = _replace_deprecated_option("num_heads", NUM_HEADS, config)
+    config = _replace_deprecated_option("max_seq_length", MAX_SEQ_LENGTH, config)
+    config = _replace_deprecated_option("dense_dim", DENSE_DIM, config)
+    config = _replace_deprecated_option("embed_dim", EMBED_DIM, config)
+    config = _replace_deprecated_option("num_neg", NUM_NEG, config)
+    config = _replace_deprecated_option("mu_pos", MU_POS, config)
+    config = _replace_deprecated_option("mu_neg", MU_NEG, config)
+    config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_SIM_NEG, config)
+    config = _replace_deprecated_option("C2", C2, config)
+    config = _replace_deprecated_option(
+        "evaluate_every_num_epochs", EVAL_NUM_EPOCHS, config
+    )
+    config = _replace_deprecated_option(
+        "evaluate_on_num_examples", EVAL_NUM_EXAMPLES, config
+    )
+
+    return config

From 12c10b8c24cbf738695033aabb7556352af7f5fc Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 14:12:57 +0100
Subject: [PATCH 272/633] use utils deprecated options method in ted

---
 rasa/core/policies/ted_policy.py | 35 ++------------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index f4a0beb6578a..557a97ea9386 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -166,44 +166,13 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
         self.config.update(kwargs)
 
+        self.config = train_utils.check_deprecated_options(self.config)
+
         self.config = train_utils.update_similarity_type(self.config)
 
         if self.config[EVAL_NUM_EPOCHS] < 1:
             self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
 
-        self._check_deprecated_options()
-
-    def _check_deprecated_options(self):
-        if "hidden_layers_sizes_pre_dial" in self.config:
-            logger.warning(
-                f"Option 'hidden_layers_sizes_pre_dial' got renamed to"
-                f" {HIDDEN_LAYERS_SIZES_DIALOGUE}. Please update your configuration "
-                f"file."
-            )
-            self.config[HIDDEN_LAYERS_SIZES_DIALOGUE] = self.config[
-                "hidden_layers_sizes_pre_dial"
-            ]
-        if "hidden_layers_sizes_bot" in self.config:
-            logger.warning(
-                f"Option 'hidden_layers_sizes_bot' got renamed to "
-                f"{HIDDEN_LAYERS_SIZES_LABEL}. Please update your configuration file."
-            )
-            self.config[HIDDEN_LAYERS_SIZES_LABEL] = self.config[
-                "hidden_layers_sizes_bot"
-            ]
-        if "droprate_a" in self.config:
-            logger.warning(
-                f"Option 'droprate_a' got renamed to {DROPRATE_DIALOGUE}. Please "
-                f"update your configuration file."
-            )
-            self.config[DROPRATE_DIALOGUE] = self.config["droprate_a"]
-        if "droprate_b" in self.config:
-            logger.warning(
-                f"Option 'droprate_b' got renamed to {DROPRATE_LABEL}. Please "
-                f"update your configuration file."
-            )
-            self.config[DROPRATE_LABEL] = self.config["droprate_b"]
-
     # data helpers
     # noinspection PyPep8Naming
     @staticmethod

From 3f9c4a18a442a3558467af08b5cc8008ea83bda7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 15:16:08 +0100
Subject: [PATCH 273/633] add featurizer to crf entity extractor

---
 rasa/nlu/classifiers/diet_classifier.py       |  61 ++++++--
 .../embedding_intent_classifier.py            |  23 ++-
 rasa/nlu/extractors/crf_entity_extractor.py   | 136 ++++++++++++++++--
 .../selectors/embedding_response_selector.py  |  19 ++-
 4 files changed, 216 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index d50d911e2a9d..f83af6497a2a 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -554,7 +554,7 @@ def _check_enough_labels(model_data: RasaModelData) -> bool:
     def train(
         self,
         training_data: TrainingData,
-        cfg: Optional[RasaNLUModelConfig] = None,
+        config: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
         """Train the embedding intent classifier on a data set."""
@@ -793,6 +793,33 @@ def load(
             )
             return cls(component_config=meta)
 
+        (
+            batch_tuple_sizes,
+            inv_label_dict,
+            inv_tag_dict,
+            label_data,
+            label_key,
+            meta,
+            model_data_example,
+            tf_model_file,
+        ) = cls._load_from_files(meta, model_dir)
+
+        meta = train_utils.update_similarity_type(meta)
+
+        model = cls._load_model(
+            inv_tag_dict, label_data, label_key, meta, model_data_example, tf_model_file
+        )
+
+        return cls(
+            component_config=meta,
+            inverted_label_dict=inv_label_dict,
+            inverted_tag_dict=inv_tag_dict,
+            model=model,
+            batch_tuple_sizes=batch_tuple_sizes,
+        )
+
+    @classmethod
+    def _load_from_files(cls, meta, model_dir):
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
@@ -817,7 +844,27 @@ def load(
         ) as f:
             batch_tuple_sizes = pickle.load(f)
 
-        meta = train_utils.update_similarity_type(meta)
+        return (
+            batch_tuple_sizes,
+            inv_label_dict,
+            inv_tag_dict,
+            label_data,
+            label_key,
+            meta,
+            model_data_example,
+            tf_model_file,
+        )
+
+    @classmethod
+    def _load_model(
+        cls,
+        inv_tag_dict,
+        label_data,
+        label_key,
+        meta,
+        model_data_example,
+        tf_model_file,
+    ):
 
         model = DIET.load(
             tf_model_file,
@@ -827,20 +874,16 @@ def load(
             inverted_tag_dict=inv_tag_dict,
             config=meta,
         )
+
         # build the graph for prediction
         predict_data_example = RasaModelData(
             label_key=label_key,
             data={k: vs for k, vs in model_data_example.items() if "text" in k},
         )
+
         model.build_for_predict(predict_data_example)
 
-        return cls(
-            component_config=meta,
-            inverted_label_dict=inv_label_dict,
-            inverted_tag_dict=inv_tag_dict,
-            model=model,
-            batch_tuple_sizes=batch_tuple_sizes,
-        )
+        return model
 
 
 # pytype: disable=key-error
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7c83196d9b0a..75a4a0af95fe 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,6 +1,9 @@
 import logging
+import os
+import warnings
 from typing import Any, Dict, Optional, Text
 
+from rasa.nlu.model import Metadata
 from rasa.constants import DOCS_BASE_URL
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
@@ -47,6 +50,8 @@
     BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
+from rasa.utils.tensorflow.tf_models import RasaModel
+from rasa.utils import train_utils
 
 logger = logging.getLogger(__name__)
 
@@ -136,13 +141,27 @@ class EmbeddingIntentClassifier(DIETClassifier):
     }
     # end default properties (DOC MARKER - don't remove)
 
-    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        model: Optional[RasaModel] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+    ) -> None:
+
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
 
-        super().__init__(component_config)
+        super().__init__(
+            component_config,
+            inverted_label_dict,
+            inverted_tag_dict,
+            model,
+            batch_tuple_sizes,
+        )
 
         raise_warning(
             f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead .",
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index a85fcb7f5306..1c596907bb1c 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -1,6 +1,14 @@
 import logging
+import os
+import warnings
 from typing import Any, Dict, Optional, Text
 
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
+    LexicalSyntacticFeaturizer,
+)
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import TrainingData, Message
 from rasa.constants import DOCS_BASE_URL
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
@@ -13,10 +21,7 @@
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_TEXT,
     SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -35,6 +40,7 @@
     BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
+from rasa.utils.tensorflow.tf_models import RasaModel
 
 logger = logging.getLogger(__name__)
 
@@ -51,6 +57,28 @@ class CRFEntityExtractor(DIETClassifier):
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
+        # 'features' is [before, word, after] array with before, word,
+        # after holding keys about which features to use for each word,
+        # for example, 'title' in array before will have the feature
+        # "is the preceding word in title case?"
+        # POS features require 'SpacyTokenizer'.
+        "features": [
+            ["low", "title", "upper"],
+            [
+                "BOS",
+                "EOS",
+                "low",
+                "prefix5",
+                "prefix2",
+                "suffix5",
+                "suffix3",
+                "suffix2",
+                "upper",
+                "title",
+                "digit",
+            ],
+            ["low", "title", "upper"],
+        ],
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
@@ -89,22 +117,112 @@ class CRFEntityExtractor(DIETClassifier):
     }
     # end default properties (DOC MARKER - don't remove)
 
-    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        featurizer: Optional[LexicalSyntacticFeaturizer] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        model: Optional[RasaModel] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+    ) -> None:
         component_config[INTENT_CLASSIFICATION] = False
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
-        component_config[TRANSFORMER_SIZE] = 128
         component_config[NUM_TRANSFORMER_LAYERS] = 0
-        component_config[NUM_HEADS] = 4
         component_config[SHARE_HIDDEN_LAYERS] = False
-        component_config[MAX_SEQ_LENGTH] = 256
         component_config[UNIDIRECTIONAL_ENCODER] = True
 
-        super().__init__(component_config)
+        super().__init__(
+            component_config,
+            inverted_label_dict,
+            inverted_tag_dict,
+            model,
+            batch_tuple_sizes,
+        )
+
+        self.featurizer = featurizer or LexicalSyntacticFeaturizer(
+            self.component_config
+        )
 
         raise_warning(
             f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in"
-            f"combination with the 'LexicalSyntacticFeaturizer'.",
+            f"combination with 'LexicalSyntacticFeaturizer'.",
             category=DeprecationWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
+    ) -> None:
+
+        self.featurizer.train(training_data, **kwargs)
+
+        super().train(training_data, config, **kwargs)
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        self.featurizer.process(message, **kwargs)
+
+        super().process(message, **kwargs)
+
+    def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
+
+        self.featurizer.persist(file_name, model_dir)
+
+        return super().persist(file_name, model_dir)
+
+    @classmethod
+    def load(
+        cls,
+        meta: Dict[Text, Any],
+        model_dir: Text = None,
+        model_metadata: Metadata = None,
+        cached_component: Optional["CRFEntityExtractor"] = None,
+        **kwargs: Any,
+    ) -> "CRFEntityExtractor":
+
+        if not model_dir or not meta.get("file"):
+            warnings.warn(
+                f"Failed to load 'CRFEntityExtractor'. "
+                f"Maybe the path '{os.path.abspath(model_dir)}' doesn't exist?"
+            )
+            return cls(component_config=meta)
+
+        featurizer = LexicalSyntacticFeaturizer.load(
+            meta, model_dir, model_metadata, cached_component, **kwargs
+        )
+
+        (
+            batch_tuple_sizes,
+            inv_label_dict,
+            inv_tag_dict,
+            label_data,
+            label_key,
+            meta,
+            model_data_example,
+            tf_model_file,
+        ) = cls._load_from_files(meta, model_dir)
+
+        meta[INTENT_CLASSIFICATION] = False
+        meta[ENTITY_RECOGNITION] = True
+        meta[MASKED_LM] = False
+        meta[NUM_TRANSFORMER_LAYERS] = 0
+        meta[SHARE_HIDDEN_LAYERS] = False
+        meta[UNIDIRECTIONAL_ENCODER] = True
+
+        model = cls._load_model(
+            inv_tag_dict, label_data, label_key, meta, model_data_example, tf_model_file
+        )
+
+        return cls(
+            component_config=meta,
+            featurizer=featurizer,
+            inverted_label_dict=inv_label_dict,
+            inverted_tag_dict=inv_tag_dict,
+            model=model,
+            batch_tuple_sizes=batch_tuple_sizes,
+        )
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index f0f87c94bb1e..56a7ef7f1717 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -48,7 +48,7 @@
     SPARSE_FEATURE_NAMES,
 )
 from rasa.utils.tensorflow.tf_model_data import RasaModelData
-
+from rasa.utils.tensorflow.tf_models import RasaModel
 
 logger = logging.getLogger(__name__)
 
@@ -162,14 +162,27 @@ class ResponseSelector(DIETClassifier):
     }
     # end default properties (DOC MARKER - don't remove)
 
-    def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        model: Optional[RasaModel] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+    ):
         # ResponseSelector should not be able to set the following properties
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
 
-        super().__init__(component_config)
+        super().__init__(
+            component_config,
+            inverted_label_dict,
+            inverted_tag_dict,
+            model,
+            batch_tuple_sizes,
+        )
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]

From 2f16d97ba06e02cdef3f0681f2c9ddfe4ee799ee Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 15:25:31 +0100
Subject: [PATCH 274/633] add types

---
 rasa/nlu/classifiers/diet_classifier.py     | 37 +++++++++------------
 rasa/nlu/extractors/crf_entity_extractor.py |  8 ++---
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index f83af6497a2a..e27da27b0e2f 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -780,7 +780,7 @@ def load(
         cls,
         meta: Dict[Text, Any],
         model_dir: Text = None,
-        model_metadata: "Metadata" = None,
+        model_metadata: Metadata = None,
         cached_component: Optional["DIETClassifier"] = None,
         **kwargs: Any,
     ) -> "DIETClassifier":
@@ -798,17 +798,13 @@ def load(
             inv_label_dict,
             inv_tag_dict,
             label_data,
-            label_key,
             meta,
-            model_data_example,
-            tf_model_file,
+            data_example,
         ) = cls._load_from_files(meta, model_dir)
 
         meta = train_utils.update_similarity_type(meta)
 
-        model = cls._load_model(
-            inv_tag_dict, label_data, label_key, meta, model_data_example, tf_model_file
-        )
+        model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
 
         return cls(
             component_config=meta,
@@ -819,14 +815,11 @@ def load(
         )
 
     @classmethod
-    def _load_from_files(cls, meta, model_dir):
+    def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         file_name = meta.get("file")
-        tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
-
-        label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else "tag_ids"
 
         with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "rb") as f:
-            model_data_example = RasaModelData(label_key=label_key, data=pickle.load(f))
+            data_example = pickle.load(f)
 
         with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
             label_data = pickle.load(f)
@@ -849,22 +842,24 @@ def _load_from_files(cls, meta, model_dir):
             inv_label_dict,
             inv_tag_dict,
             label_data,
-            label_key,
             meta,
-            model_data_example,
-            tf_model_file,
+            data_example,
         )
 
     @classmethod
     def _load_model(
         cls,
-        inv_tag_dict,
-        label_data,
-        label_key,
-        meta,
-        model_data_example,
-        tf_model_file,
+        inv_tag_dict: Dict[int, Text],
+        label_data: RasaModelData,
+        meta: Dict[Text, Any],
+        data_example: Dict[Text, List[np.ndarray]],
+        model_dir: Text,
     ):
+        file_name = meta.get("file")
+        tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
+
+        label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else "tag_ids"
+        model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
         model = DIET.load(
             tf_model_file,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 1c596907bb1c..3e8f81910419 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -201,10 +201,8 @@ def load(
             inv_label_dict,
             inv_tag_dict,
             label_data,
-            label_key,
             meta,
-            model_data_example,
-            tf_model_file,
+            data_example,
         ) = cls._load_from_files(meta, model_dir)
 
         meta[INTENT_CLASSIFICATION] = False
@@ -214,9 +212,7 @@ def load(
         meta[SHARE_HIDDEN_LAYERS] = False
         meta[UNIDIRECTIONAL_ENCODER] = True
 
-        model = cls._load_model(
-            inv_tag_dict, label_data, label_key, meta, model_data_example, tf_model_file
-        )
+        model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
 
         return cls(
             component_config=meta,

From 2143580a832999c45b4c4a976141f2a89221eaea Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 15:43:22 +0100
Subject: [PATCH 275/633] update requires

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 8 +-------
 rasa/nlu/extractors/crf_entity_extractor.py         | 4 +++-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 75a4a0af95fe..d5b2df3b2438 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,14 +1,10 @@
 import logging
-import os
-import warnings
 from typing import Any, Dict, Optional, Text
 
-from rasa.nlu.model import Metadata
 from rasa.constants import DOCS_BASE_URL
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
-    TOKENS_NAMES,
     TEXT_ATTRIBUTE,
     ENTITIES_ATTRIBUTE,
     DENSE_FEATURE_NAMES,
@@ -51,7 +47,6 @@
 )
 from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.tf_models import RasaModel
-from rasa.utils import train_utils
 
 logger = logging.getLogger(__name__)
 
@@ -61,10 +56,9 @@ class EmbeddingIntentClassifier(DIETClassifier):
     provides = [ENTITIES_ATTRIBUTE]
 
     requires = [
-        TOKENS_NAMES[TEXT_ATTRIBUTE],
         any_of(
             DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        ),
+        )
     ]
 
     # default properties (DOC MARKER - don't remove)
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 3e8f81910419..fee1050ac080 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -17,6 +17,7 @@
     ENTITIES_ATTRIBUTE,
     DENSE_FEATURE_NAMES,
     SPARSE_FEATURE_NAMES,
+    TOKENS_NAMES,
 )
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_TEXT,
@@ -50,9 +51,10 @@ class CRFEntityExtractor(DIETClassifier):
     provides = [ENTITIES_ATTRIBUTE]
 
     requires = [
+        TOKENS_NAMES[TEXT_ATTRIBUTE],
         any_of(
             DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        )
+        ),
     ]
 
     # default properties (DOC MARKER - don't remove)

From b983d7dc64d60e455b7ae3027698d6a06ef4d153 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 3 Feb 2020 17:19:56 +0100
Subject: [PATCH 276/633] add relative attention

---
 rasa/core/policies/ted_policy.py        |   2 +
 rasa/nlu/classifiers/diet_classifier.py |   4 +-
 rasa/utils/tensorflow/layers.py         |   2 +-
 rasa/utils/tensorflow/models.py         |   2 +-
 rasa/utils/tensorflow/transformer.py    | 274 +++++++++++++++++++-----
 5 files changed, 230 insertions(+), 54 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index dc6a3a63780e..410547a1695b 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -506,6 +506,8 @@ def _prepare_layers(self) -> None:
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=0,
             unidirectional=True,
+            use_key_relative_position=True,
+            max_relative_position=5,
             name="dialogue_encoder",
         )
         self._tf_layers["embed.dialogue"] = layers.Embed(
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 21423b1c5346..c51506aeb08e 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -959,8 +959,10 @@ def _prepare_sequence_layers(self) -> None:
                 self.config[MAX_SEQ_LENGTH],
                 self.config[C2],
                 dropout_rate=self.config[DROPRATE],
-                attention_dropout_rate=self.config[DROPRATE],
+                attention_dropout_rate=0,
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
+                use_key_relative_position=True,
+                max_relative_position=5,
                 name="text_encoder",
             )
             if self.config[NUM_TRANSFORMER_LAYERS] > 0
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 4c2cf593c2c1..6ef743dfb990 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -9,7 +9,7 @@
 
 class SparseDropout(tf.keras.layers.Dropout):
     def call(self, inputs: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
-        def dropped_inputs():
+        def dropped_inputs() -> tf.Tensor:
             to_retain_prob = tf.random.uniform(
                 tf.shape(inputs.values), 0, 1, inputs.values.dtype
             )
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index acde5e944bda..1cd5eabe6285 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -127,7 +127,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = False
+        self, predict_data: RasaModelData, eager: bool = True
     ) -> None:
         def predict_dataset_function(  # to reuse the same helper method
             _batch_size: Union[tf.Tensor, int]
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 9c5ce3960da0..42d29aca4bb8 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -1,6 +1,7 @@
 from typing import List, Optional, Text, Tuple, Callable
 import tensorflow as tf
 import tensorflow_addons as tfa
+from tensorflow.python.keras.utils import tf_utils
 import numpy as np
 from rasa.utils.tensorflow.layers import DenseWithSparseWeights
 
@@ -14,7 +15,8 @@ def __init__(
         num_heads: int,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
-        use_relative_position: bool = False,
+        use_key_relative_position: bool = False,
+        use_value_relative_position: bool = False,
         max_relative_position: Optional[int] = None,
         heads_share_relative_embedding: bool = False,
     ) -> None:
@@ -22,9 +24,12 @@ def __init__(
 
         self.num_heads = num_heads
         self.d_model = d_model
+        self.attention_dropout_rate = attention_dropout_rate
         self.unidirectional = unidirectional
-        self.use_relative_position = use_relative_position
+        self.use_key_relative_position = use_key_relative_position
+        self.use_value_relative_position = use_value_relative_position
         self.max_relative_position = max_relative_position
+        self.heads_share_relative_embedding = heads_share_relative_embedding
 
         assert d_model % self.num_heads == 0
 
@@ -34,36 +39,52 @@ def __init__(
         self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
         self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
 
-        if use_relative_position:
-            if not max_relative_position:
+        self._dense = DenseWithSparseWeights(units=d_model)
+
+        self._add_relative_embeddings()
+
+    def _add_relative_embeddings(self) -> None:
+        """Instantiate relative embeddings."""
+
+        if self.use_key_relative_position or self.use_value_relative_position:
+            if not self.max_relative_position:
                 raise ValueError(
-                    f"Max relative position {max_relative_position} "
+                    f"Max relative position {self.max_relative_position} "
                     f"should be > 0 when using relative attention."
                 )
 
-            if unidirectional:
-                max_relative_position_unmasked = max_relative_position
+            if self.unidirectional:
+                max_relative_position_unmasked = self.max_relative_position
             else:
-                max_relative_position_unmasked = 2 * max_relative_position - 1
+                max_relative_position_unmasked = 2 * self.max_relative_position - 1
 
-            if heads_share_relative_embedding:
+            if self.heads_share_relative_embedding:
                 relative_embedding_shape = (max_relative_position_unmasked, self._depth)
             else:
                 relative_embedding_shape = (
-                    num_heads,
+                    self.num_heads,
                     max_relative_position_unmasked,
                     self._depth,
                 )
 
             initializer = tf.keras.initializers.TruncatedNormal(
-                stddev=self._depth ** -0.5
+                stddev=tf.math.sqrt(tf.cast(self._depth, tf.float32))
             )
+        else:
+            initializer = None
+            relative_embedding_shape = None
+
+        if self.use_key_relative_position:
             self.key_relative_embeddings = self.add_weight(
                 shape=relative_embedding_shape,
                 initializer=initializer,
                 trainable=True,
                 name="key_relative_embeddings",
             )
+        else:
+            self.key_relative_embeddings = None
+
+        if self.use_value_relative_position:
             self.value_relative_embeddings = self.add_weight(
                 shape=relative_embedding_shape,
                 initializer=initializer,
@@ -71,14 +92,145 @@ def __init__(
                 name="value_relative_embeddings",
             )
         else:
-            self.key_relative_embeddings = None
             self.value_relative_embeddings = None
 
-        self._attention_dropout = tf.keras.layers.Dropout(attention_dropout_rate)
+    def _pad_relative_embeddings(self, x, length):
+        # pad the left side to length
+        pad_left = x[:, :, :, :1, :]
+        pad_left = tf.tile(pad_left, (1, 1, 1, length - self.max_relative_position, 1))
 
-        self._dense = DenseWithSparseWeights(units=d_model)
+        # pad the right side to length
+        if self.unidirectional:
+            m_right = 1  # current time
+            pad_right = tf.zeros_like(x[:, :, :, -1:, :])
+        else:
+            m_right = self.max_relative_position
+            pad_right = x[:, :, :, -1:, :]
+        pad_right = tf.tile(pad_right, (1, 1, 1, length - m_right, 1))
+
+        return tf.concat([pad_left, x, pad_right], axis=-2)
+
+    def _slice_relative_embeddings(self, x, length):
+        if self.unidirectional:
+            # pad the right side to length
+            pad_right = tf.zeros_like(x[:, :, :, -1:, :])
+            pad_right = tf.tile(pad_right, (1, 1, 1, length - 1, 1))
+            x = tf.concat([x, pad_right], axis=-2)
+
+        dl = self.max_relative_position - length
+        m = tf.shape(x)[-2]
+        return x[:, :, :, dl : m - dl, :]
+
+    def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor:
+        """Universal method to convert tensor from relative to absolute indexing.
+
+        x.shape =
+        (batch, num_heads, length, relative_length, depth)
+        or (batch, num_heads, length, relative_length)
+        "Slides" relative embeddings by 45 degree """
+
+        x_dim = len(x.shape)
+
+        if x_dim < 4 or x_dim > 5:
+            raise ValueError("Relative tensor has a wrong shape.")
+        if x_dim == 4:
+            # add fake depth dimension
+            x = tf.expand_dims(x, axis=-1)
+
+        batch = tf.shape(x)[0]
+        num_heads = tf.shape(x)[1]
+        length = tf.shape(x)[2]
+        depth = tf.shape(x)[-1]
+
+        x = tf.cond(
+            length > self.max_relative_position,
+            lambda: self._pad_relative_embeddings(x, length),
+            lambda: self._slice_relative_embeddings(x, length),
+        )
+
+        # add a column of zeros to "slide" columns to diagonals through reshape
+        pad_shift = tf.zeros_like(x[:, :, :, -1:, :])
+        x = tf.concat([x, pad_shift], axis=-2)
+
+        # flatten length dimensions
+        x = tf.reshape(x, (batch, num_heads, -1, depth))
+        width = 2 * length
+
+        # add zeros so that the result of back reshape is still a matrix
+        pad_flat = tf.zeros_like(
+            x[:, :, : (width - 1) - width * length % (width - 1), :]
+        )
+        x = tf.concat([x, pad_flat], axis=-2)
+
+        # "slide" columns to diagonals through reshape
+        x = tf.reshape(x, (batch, num_heads, -1, width - 1, depth))
+
+        # slice needed "diagonal" matrix
+        x = x[:, :, :-1, -length:, :]
+
+        if x_dim == 4:
+            # remove fake depth dimension
+            x = tf.squeeze(x, axis=-1)
+
+        return x
+
+    def _matmul_with_relative_keys(self, x: tf.Tensor) -> tf.Tensor:
+        y = self.key_relative_embeddings
+
+        if self.heads_share_relative_embedding:
+            matmul = tf.einsum("bhld,md->bhlm", x, y)
+        else:
+            matmul = tf.einsum("bhld,hmd->bhlm", x, y)
 
-    def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
+        return self._relative_to_absolute_position(matmul)
+
+    def _tile_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
+        if self.heads_share_relative_embedding:
+            x = tf.expand_dims(x, axis=0)  # add head dimension
+
+        x = tf.expand_dims(x, axis=1)  # add length dimension
+        x = tf.tile(x, (1, length, 1, 1))
+        return tf.expand_dims(x, axis=0)  # add batch dimension
+
+    def _squeeze_relative_embeddings(self, x: tf.Tensor) -> tf.Tensor:
+        x = tf.squeeze(x, axis=0)  # squeeze batch dimension
+        if self.heads_share_relative_embedding:
+            x = tf.squeeze(x, axis=1)  # squeeze head dimension
+        return x
+
+    def _matmul_with_relative_values(self, x: tf.Tensor) -> tf.Tensor:
+        y = self._tile_relative_embeddings(
+            self.value_relative_embeddings, tf.shape(x)[-2]
+        )
+        y = self._relative_to_absolute_position(y)
+        y = self._squeeze_relative_embeddings(y)
+
+        if self.heads_share_relative_embedding:
+            return tf.einsum("bhlm,lmd->bhld", x, y)
+        else:
+            return tf.einsum("bhlm,hlmd->bhld", x, y)
+
+    def _drop_attention_logits(
+        self, logits: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor
+    ) -> tf.Tensor:
+        def droped_logits() -> tf.Tensor:
+            keep_prob = tf.random.uniform(tf.shape(logits), 0, 1) + pad_mask
+            drop_mask = tf.cast(
+                tf.less(keep_prob, self.attention_dropout_rate), logits.dtype
+            )
+
+            return logits + drop_mask * -1e9
+
+        return tf_utils.smart_cond(training, droped_logits, lambda: tf.identity(logits))
+
+    def _scaled_dot_product_attention(
+        self,
+        q: tf.Tensor,
+        k: tf.Tensor,
+        v: tf.Tensor,
+        pad_mask: tf.Tensor,
+        training: tf.Tensor,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Calculate the attention weights.
         q, k, v must have matching leading dimensions.
         k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
@@ -98,7 +250,8 @@ def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
 
         matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
 
-        # TODO add key relative embeddings
+        if self.use_key_relative_position:
+            matmul_qk += self._matmul_with_relative_keys(q)
 
         # scale matmul_qk
         dk = tf.cast(tf.shape(k)[-1], tf.float32)
@@ -108,19 +261,19 @@ def _scaled_dot_product_attention(self, q, k, v, pad_mask, training):
         if pad_mask is not None:
             logits += pad_mask * -1e9
 
+        # apply attention dropout before softmax to maintain attention_weights norm as 1
+        if self.attention_dropout_rate > 0:
+            logits = self._drop_attention_logits(logits, pad_mask, training)
+
         # softmax is normalized on the last axis (seq_len_k) so that the scores
         # add up to 1.
         attention_weights = tf.nn.softmax(
             logits, axis=-1
         )  # (..., seq_len_q, seq_len_k)
 
-        attention_weights = self._attention_dropout(
-            attention_weights, training=training
-        )
-
         output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
-
-        # TODO add value relative embedding to values
+        if self.use_value_relative_position:
+            output += self._matmul_with_relative_values(attention_weights)
 
         return output, attention_weights
 
@@ -188,12 +341,23 @@ def __init__(
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
+        use_key_relative_position: bool = False,
+        use_value_relative_position: bool = False,
+        max_relative_position: Optional[int] = None,
+        heads_share_relative_embedding: bool = False,
     ) -> None:
         super().__init__()
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self._mha = MultiHeadAttention(
-            d_model, num_heads, attention_dropout_rate, unidirectional
+            d_model,
+            num_heads,
+            attention_dropout_rate,
+            unidirectional,
+            use_key_relative_position,
+            use_value_relative_position,
+            max_relative_position,
+            heads_share_relative_embedding,
         )
         self._dropout = tf.keras.layers.Dropout(dropout_rate)
 
@@ -222,34 +386,6 @@ def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Ten
 
 
 class TransformerEncoder(tf.keras.layers.Layer):
-    @staticmethod
-    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
-        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
-
-    @staticmethod
-    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
-        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
-        return pos * angle_dropout_rates
-
-    @classmethod
-    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
-        angle_rads = cls._get_angles(
-            np.arange(max_position)[:, np.newaxis],
-            np.arange(d_model)[np.newaxis, :],
-            d_model,
-        )
-
-        # apply sin to even indices in the array; 2i
-        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
-
-        # apply cos to odd indices in the array; 2i+1
-        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
-
-        pos_encoding = angle_rads[np.newaxis, ...]
-
-        return tf.cast(pos_encoding, dtype=tf.float32)
-
     def __init__(
         self,
         num_layers: int,
@@ -261,6 +397,10 @@ def __init__(
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
+        use_key_relative_position: bool = True,
+        use_value_relative_position: bool = False,
+        max_relative_position: Optional[int] = 5,
+        heads_share_relative_embedding: bool = False,
         name: Optional[Text] = None,
     ) -> None:
         super().__init__(name=name)
@@ -285,11 +425,43 @@ def __init__(
                 dropout_rate,
                 attention_dropout_rate,
                 unidirectional,
+                use_key_relative_position,
+                use_value_relative_position,
+                max_relative_position,
+                heads_share_relative_embedding,
             )
             for _ in range(num_layers)
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
+    @staticmethod
+    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
+        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
+        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
+
+    @staticmethod
+    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
+        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+        return pos * angle_dropout_rates
+
+    @classmethod
+    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
+        angle_rads = cls._get_angles(
+            np.arange(max_position)[:, np.newaxis],
+            np.arange(d_model)[np.newaxis, :],
+            d_model,
+        )
+
+        # apply sin to even indices in the array; 2i
+        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+        # apply cos to odd indices in the array; 2i+1
+        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+        pos_encoding = angle_rads[np.newaxis, ...]
+
+        return tf.cast(pos_encoding, dtype=tf.float32)
+
     def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
 
         # adding embedding and position encoding.

From 44cd0ea973f1b82e5a42ce4549b59acdb0a5a1c1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 17:36:42 +0100
Subject: [PATCH 277/633] update tests

---
 rasa/nlu/classifiers/diet_classifier.py       |  15 +-
 rasa/nlu/extractors/crf_entity_extractor.py   |   3 +
 .../extractors/test_crf_entity_extractor.py   | 260 ------------------
 tests/nlu/training/test_train.py              |   1 +
 4 files changed, 12 insertions(+), 267 deletions(-)
 delete mode 100644 tests/nlu/extractors/test_crf_entity_extractor.py

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index e27da27b0e2f..b14dad5c7be8 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -973,13 +973,14 @@ def _prepare_sequence_layers(self) -> None:
         self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
-        self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
-            self.data_signature["text_features"],
-            "text",
-            self.config[C2],
-            self.config[DENSE_DIM]["text"],
-        )
-        if self.config[INTENT_CLASSIFICATION]:
+        if "text_features" in self.data_signature:
+            self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
+                self.data_signature["text_features"],
+                "text",
+                self.config[C2],
+                self.config[DENSE_DIM]["text"],
+            )
+        if "label_features" in self.data_signature:
             self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
                 self.data_signature["label_features"],
                 "label",
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index fee1050ac080..0f2da69f4b78 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -161,6 +161,9 @@ def train(
         **kwargs: Any,
     ) -> None:
 
+        if not training_data.entity_examples:
+            return
+
         self.featurizer.train(training_data, **kwargs)
 
         super().train(training_data, config, **kwargs)
diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
deleted file mode 100644
index 8c832894fd25..000000000000
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ /dev/null
@@ -1,260 +0,0 @@
-from rasa.nlu.constants import TEXT_ATTRIBUTE
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.training_data import TrainingData, Message
-
-
-def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
-    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-
-    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
-    examples = [
-        Message(
-            "anywhere in the west",
-            {
-                "intent": "restaurant_search",
-                "entities": [
-                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
-                ],
-                "spacy_doc": spacy_nlp("anywhere in the west"),
-            },
-        ),
-        Message(
-            "central indian restaurant",
-            {
-                "intent": "restaurant_search",
-                "entities": [
-                    {
-                        "start": 0,
-                        "end": 7,
-                        "value": "central",
-                        "entity": "location",
-                        "extractor": "random_extractor",
-                    },
-                    {
-                        "start": 8,
-                        "end": 14,
-                        "value": "indian",
-                        "entity": "cuisine",
-                        "extractor": "CRFEntityExtractor",
-                    },
-                ],
-                "spacy_doc": spacy_nlp("central indian restaurant"),
-            },
-        ),
-    ]
-
-    # uses BILOU and the default features
-    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
-    sentence = "anywhere in the west"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
-    crf_format = ext._from_text_to_crf(Message(sentence, doc))
-    assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"]
-    feats = ext._sentence_to_features(crf_format)
-    assert "BOS" in feats[0]
-    assert "EOS" in feats[-1]
-    assert feats[1]["0:low"] == "in"
-    sentence = "anywhere in the west"
-    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
-    filtered = ext.filter_trainable_entities(examples)
-    assert filtered[0].get("entities") == [
-        {"start": 16, "end": 20, "value": "west", "entity": "location"}
-    ], "Entity without extractor remains"
-    assert filtered[1].get("entities") == [
-        {
-            "start": 8,
-            "end": 14,
-            "value": "indian",
-            "entity": "cuisine",
-            "extractor": "CRFEntityExtractor",
-        }
-    ], "Only CRFEntityExtractor entity annotation remains"
-    assert examples[1].get("entities")[0] == {
-        "start": 0,
-        "end": 7,
-        "value": "central",
-        "entity": "location",
-        "extractor": "random_extractor",
-    }, "Original examples are not mutated"
-
-
-def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
-    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-
-    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
-    sentence = "I need a home cleaning close-by"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
-    r = ext._from_crf_to_json(
-        Message(sentence, doc),
-        [
-            {"O": 1.0},
-            {"O": 1.0},
-            {"O": 1.0},
-            {"B-what": 1.0},
-            {"L-what": 1.0},
-            {"B-where": 1.0},
-            {"I-where": 1.0},
-            {"L-where": 1.0},
-        ],
-    )
-    assert len(r) == 2, "There should be two entities"
-
-    assert r[0]["confidence"]  # confidence should exist
-    del r[0]["confidence"]
-    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}
-
-    assert r[1]["confidence"]  # confidence should exist
-    del r[1]["confidence"]
-    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
-
-
-def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
-    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-
-    ner_crf_pos_feature_config.update({"BILOU_flag": False})
-    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
-    sentence = "I need a home cleaning close-by"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
-    rs = ext._from_crf_to_json(
-        Message(sentence, doc),
-        [
-            {"O": 1.0},
-            {"O": 1.0},
-            {"O": 1.0},
-            {"what": 1.0},
-            {"what": 1.0},
-            {"where": 1.0},
-            {"where": 1.0},
-            {"where": 1.0},
-        ],
-    )
-
-    # non BILOU will split multi-word entities - hence 5
-    assert len(rs) == 5, "There should be five entities"
-
-    for r in rs:
-        assert r["confidence"]  # confidence should exist
-        del r["confidence"]
-
-    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
-    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
-    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
-    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
-    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
-
-
-def test_crf_create_entity_dict(spacy_nlp):
-    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
-    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-
-    crf_extractor = CRFEntityExtractor()
-    spacy_tokenizer = SpacyTokenizer()
-    white_space_tokenizer = WhitespaceTokenizer()
-
-    examples = [
-        {
-            "message": Message(
-                "where is St. Michael's Hospital?",
-                {
-                    "intent": "search_location",
-                    "entities": [
-                        {
-                            "start": 9,
-                            "end": 31,
-                            "value": "St. Michael's Hospital",
-                            "entity": "hospital",
-                            "SpacyTokenizer": {
-                                "entity_start_token_idx": 2,
-                                "entity_end_token_idx": 5,
-                            },
-                            "WhitespaceTokenizer": {
-                                "entity_start_token_idx": 2,
-                                "entity_end_token_idx": 5,
-                            },
-                        }
-                    ],
-                    "spacy_doc": spacy_nlp("where is St. Michael's Hospital?"),
-                },
-            )
-        },
-        {
-            "message": Message(
-                "where is Children's Hospital?",
-                {
-                    "intent": "search_location",
-                    "entities": [
-                        {
-                            "start": 9,
-                            "end": 28,
-                            "value": "Children's Hospital",
-                            "entity": "hospital",
-                            "SpacyTokenizer": {
-                                "entity_start_token_idx": 2,
-                                "entity_end_token_idx": 4,
-                            },
-                            "WhitespaceTokenizer": {
-                                "entity_start_token_idx": 2,
-                                "entity_end_token_idx": 4,
-                            },
-                        }
-                    ],
-                    "spacy_doc": spacy_nlp("where is Children's Hospital?"),
-                },
-            )
-        },
-    ]
-    for ex in examples:
-        # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text
-        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT_ATTRIBUTE)
-        white_space_tokens = white_space_tokenizer.tokenize(
-            ex["message"], TEXT_ATTRIBUTE
-        )
-        for tokenizer, tokens in [
-            ("SpacyTokenizer", spacy_tokens),
-            ("WhitespaceTokenizer", white_space_tokens),
-        ]:
-            for entity in ex["message"].get("entities"):
-                parsed_entities = crf_extractor._create_entity_dict(
-                    ex["message"],
-                    tokens,
-                    entity[tokenizer]["entity_start_token_idx"],
-                    entity[tokenizer]["entity_end_token_idx"],
-                    entity["entity"],
-                    0.8,
-                )
-                assert parsed_entities == {
-                    "start": entity["start"],
-                    "end": entity["end"],
-                    "value": entity["value"],
-                    "entity": entity["entity"],
-                    "confidence": 0.8,
-                }
-
-
-def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
-    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
-
-    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
-    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
-
-    spacy_featurizer = SpacyFeaturizer()
-    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})
-
-    text = "Rasa is a company in Berlin"
-    message = Message(text)
-    message.set("spacy_doc", spacy_nlp(text))
-
-    white_space_tokenizer.process(message)
-    spacy_featurizer.process(message)
-
-    text_data = crf_extractor._from_text_to_crf(message)
-    features = crf_extractor._sentence_to_features(text_data)
-
-    assert "0:text_dense_features" in features[0]
-    for i in range(0, len(message.data.get("text_dense_features")[0])):
-        assert (
-            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
-            == message.data.get("text_dense_features")[0][i]
-        )
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index d88436564e2e..ceb6ce52d3f5 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -44,6 +44,7 @@ def pipelines_for_tests():
                 "EntitySynonymMapper",
                 "SklearnIntentClassifier",
                 "MitieIntentClassifier",
+                "EmbeddingIntentClassifier",
                 "DIETClassifier",
                 "KeywordIntentClassifier",
                 "ResponseSelector",

From 3a6b3f6442fd2a6f610f17f6ca53f68a0a40e480 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 3 Feb 2020 19:42:47 +0100
Subject: [PATCH 278/633] Fix type check.

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 5 ++++-
 rasa/nlu/extractors/crf_entity_extractor.py         | 3 +++
 rasa/nlu/selectors/embedding_response_selector.py   | 4 +++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index d5b2df3b2438..fd89d2a92496 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -144,6 +144,9 @@ def __init__(
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
 
+        component_config = component_config or {}
+
+        # the following properties are fixed for the EmbeddingIntentClassifier
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
@@ -158,7 +161,7 @@ def __init__(
         )
 
         raise_warning(
-            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead .",
+            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead.",
             category=DeprecationWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 0f2da69f4b78..c37e9091229d 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -128,6 +128,9 @@ def __init__(
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
+        component_config = component_config or {}
+
+        # the following properties are fixed for the CRFEntityExtractor
         component_config[INTENT_CLASSIFICATION] = False
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 56a7ef7f1717..1b20938306a7 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -170,7 +170,9 @@ def __init__(
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ):
-        # ResponseSelector should not be able to set the following properties
+        component_config = component_config or {}
+
+        # the following properties are fixed for the ResponseSelector
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False

From 37a5790a037163079fa3f0a3ae8a9e25c0f5678e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 3 Feb 2020 23:44:17 +0100
Subject: [PATCH 279/633] clean rel attn

---
 rasa/core/policies/ted_policy.py        |  5 +++--
 rasa/nlu/classifiers/diet_classifier.py |  5 +++--
 rasa/utils/tensorflow/layers.py         | 15 +++------------
 rasa/utils/tensorflow/models.py         |  2 +-
 rasa/utils/tensorflow/transformer.py    | 19 ++++---------------
 5 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 410547a1695b..b98b65fd0ec4 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -506,8 +506,9 @@ def _prepare_layers(self) -> None:
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=0,
             unidirectional=True,
-            use_key_relative_position=True,
-            max_relative_position=5,
+            use_key_relative_position=False,
+            use_value_relative_position=False,
+            max_relative_position=None,
             name="dialogue_encoder",
         )
         self._tf_layers["embed.dialogue"] = layers.Embed(
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c51506aeb08e..6e57a966e6a6 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -961,8 +961,9 @@ def _prepare_sequence_layers(self) -> None:
                 dropout_rate=self.config[DROPRATE],
                 attention_dropout_rate=0,
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
-                use_key_relative_position=True,
-                max_relative_position=5,
+                use_key_relative_position=False,
+                use_value_relative_position=False,
+                max_relative_position=None,
                 name="text_encoder",
             )
             if self.config[NUM_TRANSFORMER_LAYERS] > 0
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 6ef743dfb990..1d29c884cf35 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -144,12 +144,8 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
 
 class InputMask(tf.keras.layers.Layer):
     def build(self, input_shape: tf.TensorShape) -> None:
-        initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
-            shape=(1, 1, input_shape[-1]),
-            initializer=initializer,
-            trainable=True,
-            name="mask_vector",
+            shape=(1, 1, input_shape[-1]), name="mask_vector",
         )
         self.built = True
 
@@ -198,14 +194,9 @@ class CRF(tf.keras.layers.Layer):
     def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
 
-        initializer = tf.keras.initializers.GlorotUniform()
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        regularizer = tf.keras.regularizers.l1(reg_lambda)
         self.transition_params = self.add_weight(
-            shape=(num_tags, num_tags),
-            initializer=initializer,
-            regularizer=l2_regularizer,
-            trainable=True,
-            name="transitions",
+            shape=(num_tags, num_tags), regularizer=regularizer, name="transitions",
         )
 
     def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 1cd5eabe6285..acde5e944bda 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -127,7 +127,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = True
+        self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
         def predict_dataset_function(  # to reuse the same helper method
             _batch_size: Union[tf.Tensor, int]
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 42d29aca4bb8..43a5af44d886 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -66,30 +66,19 @@ def _add_relative_embeddings(self) -> None:
                     max_relative_position_unmasked,
                     self._depth,
                 )
-
-            initializer = tf.keras.initializers.TruncatedNormal(
-                stddev=tf.math.sqrt(tf.cast(self._depth, tf.float32))
-            )
         else:
-            initializer = None
             relative_embedding_shape = None
 
         if self.use_key_relative_position:
             self.key_relative_embeddings = self.add_weight(
-                shape=relative_embedding_shape,
-                initializer=initializer,
-                trainable=True,
-                name="key_relative_embeddings",
+                shape=relative_embedding_shape, name="key_relative_embeddings",
             )
         else:
             self.key_relative_embeddings = None
 
         if self.use_value_relative_position:
             self.value_relative_embeddings = self.add_weight(
-                shape=relative_embedding_shape,
-                initializer=initializer,
-                trainable=True,
-                name="value_relative_embeddings",
+                shape=relative_embedding_shape, name="value_relative_embeddings",
             )
         else:
             self.value_relative_embeddings = None
@@ -397,9 +386,9 @@ def __init__(
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
-        use_key_relative_position: bool = True,
+        use_key_relative_position: bool = False,
         use_value_relative_position: bool = False,
-        max_relative_position: Optional[int] = 5,
+        max_relative_position: Optional[int] = None,
         heads_share_relative_embedding: bool = False,
         name: Optional[Text] = None,
     ) -> None:

From 0639df26510ba12806777d76e86e8ee6fb871403 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 3 Feb 2020 23:52:02 +0100
Subject: [PATCH 280/633] add types

---
 rasa/utils/tensorflow/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 43a5af44d886..4a32f7047dbd 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -83,7 +83,7 @@ def _add_relative_embeddings(self) -> None:
         else:
             self.value_relative_embeddings = None
 
-    def _pad_relative_embeddings(self, x, length):
+    def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
         # pad the left side to length
         pad_left = x[:, :, :, :1, :]
         pad_left = tf.tile(pad_left, (1, 1, 1, length - self.max_relative_position, 1))
@@ -99,7 +99,7 @@ def _pad_relative_embeddings(self, x, length):
 
         return tf.concat([pad_left, x, pad_right], axis=-2)
 
-    def _slice_relative_embeddings(self, x, length):
+    def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
         if self.unidirectional:
             # pad the right side to length
             pad_right = tf.zeros_like(x[:, :, :, -1:, :])

From 648713089ede4c7273cae24b7ed6abc1dfe08606 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 09:35:47 +0100
Subject: [PATCH 281/633] fix some tests

---
 rasa/nlu/classifiers/diet_classifier.py             |  6 ++----
 .../lexical_syntactic_featurizer.py                 | 10 ++++++++--
 rasa/utils/tensorflow/tf_model_data.py              |  2 +-
 sample_configs/config_crf_custom_features.yml       |  2 +-
 tests/nlu/conftest.py                               | 13 +------------
 tests/utils/test_tf_model_data.py                   | 10 +++++++++-
 6 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b14dad5c7be8..1bcdaf4dfeb2 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -616,8 +616,7 @@ def _predict_label(
 
         if self.model is None:
             logger.error(
-                "There is no trained tf.session: "
-                "component is either not trained or "
+                "There is no trained model: component is either not trained or "
                 "didn't receive enough training data."
             )
             return label, label_ranking
@@ -668,8 +667,7 @@ def _predict_entities(
     ) -> List[Dict]:
         if self.model is None:
             logger.error(
-                "There is no trained tf.session: "
-                "component is either not trained or "
+                "There is no trained model: component is either not trained or "
                 "didn't receive enough training data"
             )
             return []
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 261d1fa01209..5e8e1b5cb917 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -4,10 +4,10 @@
 import numpy as np
 import os
 import pickle
-import typing
 import scipy.sparse
 from typing import Any, Dict, Optional, Text, List
 
+from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.config import RasaNLUModelConfig
@@ -247,13 +247,19 @@ def _get_feature_value(
         token_idx: int,
         pointer_position: int,
         token_length: int,
-    ):
+    ) -> Any:
         if feature == "EOS":
             return token_idx + pointer_position == token_length - 1
 
         if feature == "BOS":
             return token_idx + pointer_position == 0
 
+        if feature not in self.function_dict:
+            raise ValueError(
+                f"Configured feature '{feature}' not valid. Please check "
+                f"'{DOCS_URL_COMPONENTS}' for valid configuration parameters."
+            )
+
         value = self.function_dict[feature](token)
         if value is None:
             logger.debug(
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index bd0f87f09cb5..edb65af10a52 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -167,7 +167,7 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
             raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
 
         # skip balancing if labels are token based
-        if data[self.label_key][0].size > 2:
+        if data[self.label_key][0][0].size > 1:
             return data
 
         label_ids = self._create_label_ids(data[self.label_key][0])
diff --git a/sample_configs/config_crf_custom_features.yml b/sample_configs/config_crf_custom_features.yml
index 70bdc09129e2..1301091a5a6c 100644
--- a/sample_configs/config_crf_custom_features.yml
+++ b/sample_configs/config_crf_custom_features.yml
@@ -15,7 +15,7 @@ pipeline:
     # features for word before token
     - ["low", "title", "upper", "pos", "pos2"]
     # features of token itself
-    - ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"]
+    - ["low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"]
     # features for word after the token we want to tag
     - ["low", "title", "upper", "pos", "pos2"]
   max_iterations: 50
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 21588dac0d2b..8327c4572774 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -39,18 +39,7 @@ def ner_crf_pos_feature_config():
     return {
         "features": [
             ["low", "title", "upper", "pos", "pos2"],
-            [
-                "bias",
-                "low",
-                "suffix3",
-                "suffix2",
-                "upper",
-                "title",
-                "digit",
-                "pos",
-                "pos2",
-                "pattern",
-            ],
+            ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
             ["low", "title", "upper", "pos", "pos2"],
         ]
     }
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index 17f58ed787dc..5f5d3009c634 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -153,12 +153,20 @@ def test_gen_batch(model_data: RasaModelData):
         next(iterator)
 
 
-def test_balance_session_data(model_data: RasaModelData):
+def test_balance_model_data(model_data: RasaModelData):
     data = model_data.balanced_data(model_data.data, 2, False)
 
     assert np.all(data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))
 
 
+def test_not_balance_model_data(model_data: RasaModelData):
+    test_model_data = RasaModelData(label_key="tag_ids", data=model_data.data)
+
+    data = test_model_data.balanced_data(test_model_data.data, 2, False)
+
+    assert np.all(data.get("tag_ids") == test_model_data.get("tag_ids"))
+
+
 def test_get_num_of_features(model_data: RasaModelData):
     num_features = model_data.get_feature_dimension("text_features")
 

From 0f5245029efdc850d6c5ed524f8a2e69b4a8c162 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 09:43:07 +0100
Subject: [PATCH 282/633] Update default values.

---
 rasa/nlu/classifiers/diet_classifier.py       |  4 ++--
 .../embedding_intent_classifier.py            | 19 +++----------------
 rasa/nlu/extractors/crf_entity_extractor.py   |  2 --
 3 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 1bcdaf4dfeb2..3bedd9439293 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -163,7 +163,7 @@ class DIETClassifier(EntityExtractor):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: True,
+        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -182,7 +182,7 @@ class DIETClassifier(EntityExtractor):
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
-        BILOU_FLAG: False,
+        BILOU_FLAG: True,
     }
     # end default properties (DOC MARKER - don't remove)
 
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index fd89d2a92496..3071a0ab04dc 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -14,10 +14,7 @@
     HIDDEN_LAYERS_SIZES_TEXT,
     HIDDEN_LAYERS_SIZES_LABEL,
     SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -34,7 +31,6 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     C_EMB,
     C2,
@@ -66,20 +62,12 @@ class EmbeddingIntentClassifier(DIETClassifier):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [],
+        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
         HIDDEN_LAYERS_SIZES_LABEL: [],
         # Whether to share the hidden layer weights between input words and labels
         SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
-        TRANSFORMER_SIZE: 256,
-        # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 2,
-        # number of attention heads in transformer
-        NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -123,10 +111,8 @@ class EmbeddingIntentClassifier(DIETClassifier):
         C_EMB: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: True,
         # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: True,
+        SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -151,6 +137,7 @@ def __init__(
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
+        component_config[NUM_TRANSFORMER_LAYERS] = 0
 
         super().__init__(
             component_config,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index c37e9091229d..d18198e37914 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -136,7 +136,6 @@ def __init__(
         component_config[MASKED_LM] = False
         component_config[NUM_TRANSFORMER_LAYERS] = 0
         component_config[SHARE_HIDDEN_LAYERS] = False
-        component_config[UNIDIRECTIONAL_ENCODER] = True
 
         super().__init__(
             component_config,
@@ -218,7 +217,6 @@ def load(
         meta[MASKED_LM] = False
         meta[NUM_TRANSFORMER_LAYERS] = 0
         meta[SHARE_HIDDEN_LAYERS] = False
-        meta[UNIDIRECTIONAL_ENCODER] = True
 
         model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
 

From 6f702d0015feb321233442b692d4a02296f213a5 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 09:48:31 +0100
Subject: [PATCH 283/633] Add EmbeddingPolicy

---
 rasa/core/policies/embedding_policy.py | 131 +++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 rasa/core/policies/embedding_policy.py

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
new file mode 100644
index 000000000000..c0dad1405c1e
--- /dev/null
+++ b/rasa/core/policies/embedding_policy.py
@@ -0,0 +1,131 @@
+import logging
+from typing import Any, Dict, Optional, Text
+
+from core.constants import DEFAULT_POLICY_PRIORITY
+from core.featurizers import TrackerFeaturizer
+from core.policies.ted_policy import TEDPolicy
+from rasa.constants import DOCS_BASE_URL
+from rasa.utils.tensorflow.constants import (
+    HIDDEN_LAYERS_SIZES_LABEL,
+    NUM_TRANSFORMER_LAYERS,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    C_EMB,
+    C2,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+    HIDDEN_LAYERS_SIZES_DIALOGUE,
+    TRANSFORMER_SIZE,
+    MAX_SEQ_LENGTH,
+    NUM_HEADS,
+    DROPRATE_DIALOGUE,
+    DROPRATE_LABEL,
+)
+from rasa.utils.common import raise_warning
+from rasa.utils.tensorflow.tf_models import RasaModel
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingPolicy(TEDPolicy):
+    """Transformer Embedding Dialogue Policy (TEDP)
+
+    Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707
+    """
+
+    SUPPORTS_ONLINE_TRAINING = True
+
+    # default properties (DOC MARKER - don't remove)
+    defaults = {
+        # nn architecture
+        # a list of hidden layers sizes before user embed layer
+        # number of hidden layers is equal to the length of this list
+        HIDDEN_LAYERS_SIZES_DIALOGUE: [],
+        # a list of hidden layers sizes before bot embed layer
+        # number of hidden layers is equal to the length of this list
+        HIDDEN_LAYERS_SIZES_LABEL: [],
+        # number of units in transformer
+        TRANSFORMER_SIZE: 128,
+        # number of transformer layers
+        NUM_TRANSFORMER_LAYERS: 1,
+        # max sequence length if pos_encoding='emb'
+        MAX_SEQ_LENGTH: 256,
+        # number of attention heads in transformer
+        NUM_HEADS: 4,
+        # training parameters
+        # initial and final batch sizes:
+        # batch size will be linearly increased for each epoch
+        BATCH_SIZES: [8, 32],
+        # how to create batches
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        # number of epochs
+        EPOCHS: 1,
+        # set random seed to any int to get reproducible results
+        RANDOM_SEED: None,
+        # embedding parameters
+        # dimension size of embedding vectors
+        EMBED_DIM: 20,
+        # the type of the similarity
+        NUM_NEG: 20,
+        # flag if minimize only maximum similarity over incorrect labels
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
+        # the type of the loss function
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        # number of top actions to normalize scores for softmax loss_type
+        # set to 0 to turn off normalization
+        RANKING_LENGTH: 10,
+        # how similar the algorithm should try
+        # to make embedding vectors for correct labels
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        # maximum negative similarity for incorrect labels
+        MU_NEG: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
+        # the number of incorrect labels, the algorithm will minimize
+        # their similarity to the user input during training
+        USE_MAX_SIM_NEG: True,  # flag which loss function to use
+        # scale loss inverse proportionally to confidence of correct prediction
+        SCALE_LOSS: True,
+        # regularization
+        # the scale of L2 regularization
+        C2: 0.001,
+        # the scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels
+        C_EMB: 0.8,
+        # dropout rate for dial nn
+        DROPRATE_DIALOGUE: 0.1,
+        # dropout rate for bot nn
+        DROPRATE_LABEL: 0.0,
+        # visualization of accuracy
+        # how often calculate validation accuracy
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
+        # how many examples to use for hold out validation set
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+    }
+    # end default properties (DOC MARKER - don't remove)
+
+    def __init__(
+        self,
+        featurizer: Optional[TrackerFeaturizer] = None,
+        priority: int = DEFAULT_POLICY_PRIORITY,
+        max_history: Optional[int] = None,
+        model: Optional[RasaModel] = None,
+        **kwargs: Dict[Text, Any],
+    ) -> None:
+
+        super().__init__(featurizer, priority, max_history, model, **kwargs)
+
+        raise_warning(
+            f"'EmbeddingPolicy' is deprecated. Use 'TEDPolicy' instead.",
+            category=DeprecationWarning,
+            docs=f"{DOCS_BASE_URL}/core/policies/",
+        )

From ca823b9212aad8c3517fe2c8ec9b730f826f081d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 11:19:52 +0100
Subject: [PATCH 284/633] update default pipelines

---
 rasa/nlu/registry.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 256740ccd7bf..eba0bd50afdd 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -40,6 +40,11 @@
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.utils.common import class_from_module_path, raise_warning
+from rasa.utils.tensorflow.constants import (
+    INTENT_CLASSIFICATION,
+    ENTITY_RECOGNITION,
+    NUM_TRANSFORMER_LAYERS,
+)
 
 if typing.TYPE_CHECKING:
     from rasa.nlu.components import Component
@@ -119,7 +124,13 @@
         {"name": "SpacyTokenizer"},
         {"name": "SpacyFeaturizer"},
         {"name": "RegexFeaturizer"},
-        {"name": "CRFEntityExtractor"},
+        {"name": "LexicalSyntacticFeaturizer"},
+        {
+            "name": "DIETClassifier",
+            INTENT_CLASSIFICATION: False,
+            ENTITY_RECOGNITION: True,
+            NUM_TRANSFORMER_LAYERS: 0,
+        },
         {"name": "EntitySynonymMapper"},
         {"name": "SklearnIntentClassifier"},
     ],
@@ -127,8 +138,7 @@
     "supervised_embeddings": [
         {"name": "WhitespaceTokenizer"},
         {"name": "RegexFeaturizer"},
-        {"name": "CRFEntityExtractor"},
-        {"name": "EntitySynonymMapper"},
+        {"name": "LexicalSyntacticFeaturizer"},
         {"name": "CountVectorsFeaturizer"},
         {
             "name": "CountVectorsFeaturizer",
@@ -136,12 +146,14 @@
             "min_ngram": 1,
             "max_ngram": 4,
         },
-        {"name": "EmbeddingIntentClassifier"},
+        {"name": "DIETClassifier"},
+        {"name": "EntitySynonymMapper"},
     ],
     "pretrained_embeddings_convert": [
         {"name": "ConveRTTokenizer"},
         {"name": "ConveRTFeaturizer"},
-        {"name": "EmbeddingIntentClassifier"},
+        {"name": "LexicalSyntacticFeaturizer"},
+        {"name": "DIETClassifier"},
     ],
 }
 

From 5162ff5dc508f043bc27d605379978838fc2af7a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 11:44:39 +0100
Subject: [PATCH 285/633] fix imports, docs, tests

---
 docs/core/policies.rst                      | 2 +-
 docs/nlu/components.rst                     | 2 +-
 examples/restaurantbot/config.yml           | 3 ---
 rasa/core/policies/embedding_policy.py      | 6 +++---
 rasa/nlu/extractors/crf_entity_extractor.py | 7 +------
 tests/nlu/utils/test_bilou_utils.py         | 6 +++---
 6 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 4d44423bbc13..ba06a147e62d 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -345,7 +345,7 @@ It is recommended to use
     These parameters can be specified in the policy configuration file.
     The default values are defined in ``EmbeddingPolicy.defaults``:
 
-    .. literalinclude:: ../../rasa/core/policies/TED_policy.py
+    .. literalinclude:: ../../rasa/core/policies/ted_policy.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index cfc81e29fd9f..c5f6c2a8a060 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -578,7 +578,7 @@ DIETClassifier
     In the config, you can specify these parameters.
     The default values are defined in ``DIETClassifier.defaults``:
 
-    .. literalinclude:: ../../rasa/nlu/classifiers/DIET_classifier.py
+    .. literalinclude:: ../../rasa/nlu/classifiers/diet_classifier.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml
index b06666b8f0dd..9bd0371277b3 100644
--- a/examples/restaurantbot/config.yml
+++ b/examples/restaurantbot/config.yml
@@ -9,7 +9,6 @@ pipeline:
     features: [
       ["low", "title", "upper"],
       [
-        "bias",
         "low",
         "prefix5",
         "prefix2",
@@ -19,8 +18,6 @@ pipeline:
         "upper",
         "title",
         "digit",
-        "pattern",
-        "text_dense_features"
       ],
       ["low", "title", "upper"],
     ]
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index c0dad1405c1e..b120a0745b8e 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1,9 +1,9 @@
 import logging
 from typing import Any, Dict, Optional, Text
 
-from core.constants import DEFAULT_POLICY_PRIORITY
-from core.featurizers import TrackerFeaturizer
-from core.policies.ted_policy import TEDPolicy
+from rasa.core.constants import DEFAULT_POLICY_PRIORITY
+from rasa.core.featurizers import TrackerFeaturizer
+from rasa.core.policies.ted_policy import TEDPolicy
 from rasa.constants import DOCS_BASE_URL
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_LABEL,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index d18198e37914..9b4da1960813 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -50,12 +50,7 @@ class CRFEntityExtractor(DIETClassifier):
 
     provides = [ENTITIES_ATTRIBUTE]
 
-    requires = [
-        TOKENS_NAMES[TEXT_ATTRIBUTE],
-        any_of(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        ),
-    ]
+    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index c65255128d5e..273006cc42fa 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -1,9 +1,9 @@
 import pytest
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
-from nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
-from nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-from nlu.training_data import TrainingData
+from rasa.nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.training_data import TrainingData
 from rasa.nlu.training_data import Message
 
 

From 8a84ef0085017e74eccc250202908c40c67bf70a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 11:57:31 +0100
Subject: [PATCH 286/633] add log message

---
 rasa/utils/tensorflow/tf_model_data.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index edb65af10a52..ffd804dc1e79 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -1,3 +1,5 @@
+import logging
+
 import numpy as np
 import scipy.sparse
 import tensorflow as tf
@@ -7,6 +9,9 @@
 from collections import defaultdict
 
 
+logger = logging.getLogger(__name__)
+
+
 Data = Optional[Dict[Text, List[np.ndarray]]]
 
 
@@ -168,6 +173,9 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
 
         # skip balancing if labels are token based
         if data[self.label_key][0][0].size > 1:
+            logger.debug(
+                f"Skip balancing data for '{self.label_key}' as data is a sequence."
+            )
             return data
 
         label_ids = self._create_label_ids(data[self.label_key][0])

From 5a71c57f1aa0d74b48d2481829d0c59548d4f436 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 13:21:22 +0100
Subject: [PATCH 287/633] first implementation ready.

---
 rasa/nlu/constants.py                         |   4 +
 .../dense_featurizer/lm_featurizer.py         |  64 ++++
 rasa/nlu/registry.py                          |   6 +
 rasa/nlu/tokenizers/lm_tokenizer.py           |  35 ++
 rasa/nlu/utils/hugging_face/__init__.py       |   0
 .../nlu/utils/hugging_face/hf_transformers.py | 347 ++++++++++++++++++
 rasa/nlu/utils/hugging_face/registry.py       |  65 ++++
 .../transformers_pre_post_processors.py       | 122 ++++++
 8 files changed, 643 insertions(+)
 create mode 100644 rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
 create mode 100644 rasa/nlu/tokenizers/lm_tokenizer.py
 create mode 100644 rasa/nlu/utils/hugging_face/__init__.py
 create mode 100644 rasa/nlu/utils/hugging_face/hf_transformers.py
 create mode 100644 rasa/nlu/utils/hugging_face/registry.py
 create mode 100644 rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 534ba66d87b6..b22cd0d4ecf6 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -36,6 +36,10 @@
 }
 
 SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"}
+TRANSFORMERS_DOCS = {
+    TEXT_ATTRIBUTE: "text_transformers_doc",
+    RESPONSE_ATTRIBUTE: "response_transformers_doc",
+}
 
 DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
new file mode 100644
index 000000000000..02a0091d8595
--- /dev/null
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -0,0 +1,64 @@
+import numpy as np
+import typing
+from typing import Any, Optional, Text
+
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.training_data import Message, TrainingData
+
+if typing.TYPE_CHECKING:
+    from spacy.tokens import Doc
+
+from rasa.nlu.constants import (
+    TEXT_ATTRIBUTE,
+    TRANSFORMERS_DOCS,
+    DENSE_FEATURE_NAMES,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    TOKENS_NAMES,
+)
+
+
+class LanguageModelFeaturizer(Featurizer):
+
+    provides = [
+        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    requires = [
+        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig],
+        **kwargs: Any,
+    ) -> None:
+
+        for example in training_data.intent_examples:
+            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+                self._set_lm_features(example, attribute)
+
+    def get_doc(self, message: Message, attribute: Text) -> Any:
+
+        return message.get(TRANSFORMERS_DOCS[attribute])
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        self._set_lm_features(message)
+
+    def _set_lm_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
+        """Adds the precomputed word vectors to the messages features."""
+
+        message_attribute_doc = self.get_doc(message, attribute)
+
+        if message_attribute_doc is not None:
+            sequence_features = message_attribute_doc["sequence_features"]
+            sentence_features = message_attribute_doc["sentence_features"]
+
+            features = np.concatenate([sequence_features, sentence_features])
+
+            features = self._combine_with_existing_dense_features(
+                message, features, DENSE_FEATURE_NAMES[attribute]
+            )
+            message.set(DENSE_FEATURE_NAMES[attribute], features)
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 3131a1598da3..ad9ba1944809 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -28,6 +28,7 @@
 from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
     CountVectorsFeaturizer,
 )
+from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.embedding_response_selector import ResponseSelector
@@ -36,8 +37,10 @@
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
+from rasa.nlu.utils.hugging_face.hf_transformers import HuggingFaceTransformers
 from rasa.utils.common import class_from_module_path, raise_warning
 
 if typing.TYPE_CHECKING:
@@ -53,12 +56,14 @@
     # utils
     SpacyNLP,
     MitieNLP,
+    HuggingFaceTransformers,
     # tokenizers
     MitieTokenizer,
     SpacyTokenizer,
     WhitespaceTokenizer,
     ConveRTTokenizer,
     JiebaTokenizer,
+    LanguageModelTokenizer,
     # extractors
     SpacyEntityExtractor,
     MitieEntityExtractor,
@@ -72,6 +77,7 @@
     LexicalSyntacticFeaturizer,
     CountVectorsFeaturizer,
     ConveRTFeaturizer,
+    LanguageModelFeaturizer,
     # classifiers
     SklearnIntentClassifier,
     MitieIntentClassifier,
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
new file mode 100644
index 000000000000..cd951fdb3602
--- /dev/null
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -0,0 +1,35 @@
+import typing
+from typing import Text, List, Any, Dict
+
+from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.training_data import Message
+
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TRANSFORMERS_DOCS,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+)
+
+
+class LanguageModelTokenizer(Tokenizer):
+
+    provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
+
+    requires = [
+        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # Flag to check whether to split intents
+        "intent_tokenization_flag": False,
+        # Symbol on which intent should be split
+        "intent_split_symbol": "_",
+    }
+
+    def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
+        return message.get(TRANSFORMERS_DOCS[attribute])
+
+    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
+        doc = self.get_doc(message, attribute)
+
+        return doc["tokens"]
diff --git a/rasa/nlu/utils/hugging_face/__init__.py b/rasa/nlu/utils/hugging_face/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
new file mode 100644
index 000000000000..3f641c7a63b7
--- /dev/null
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -0,0 +1,347 @@
+import logging
+import typing
+from typing import Any, Dict, List, Text, Tuple
+
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.components import Component
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.tokenizers.tokenizer import Token
+import numpy as np
+
+from rasa.nlu.utils.hugging_face.registry import (
+    model_class_dict,
+    model_tokenizer_dict,
+    model_weights_defaults,
+    model_special_tokens_pre_processors,
+    model_embeddings_post_processors,
+)
+
+logger = logging.getLogger(__name__)
+
+if typing.TYPE_CHECKING:
+    from transformers import *
+
+from rasa.nlu.constants import (
+    TEXT_ATTRIBUTE,
+    TRANSFORMERS_DOCS,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+)
+
+
+class HuggingFaceTransformers(Component):
+    provides = [
+        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # name of the language model to load.
+        "model_name": None,
+        # Pre-Trained weights to be loaded(string)
+        "model_weights": None,
+    }
+
+    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
+
+        super(HuggingFaceTransformers, self).__init__(component_config)
+
+        self._load_model()
+        self.whitespace_tokenizer = WhitespaceTokenizer()
+
+    def _load_model(self) -> None:
+        """Try loading the model"""
+        # import transformers
+
+        self.model_name = self.component_config["model_name"]
+
+        if self.model_name not in model_class_dict:
+            logger.error(
+                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}"
+            )
+            raise
+
+        self.model_weights = self.component_config["model_weights"]
+
+        if not self.model_weights:
+            logger.info(
+                f"Model weights not specified. Will choose default model weights: {model_weights_defaults[self.model_name]}"
+            )
+            self.model_weights = model_weights_defaults[self.model_name]
+
+        logger.info("Loading Tokenizer and Model for {}".format(self.model_name))
+        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
+            self.model_weights
+        )
+        self.model = model_class_dict[self.model_name].from_pretrained(
+            self.model_weights
+        )
+
+        # TODO
+        self.pad_token = ...
+
+    @classmethod
+    def required_packages(cls) -> List[Text]:
+        return ["transformers"]
+
+    def _lm_tokenize(self, text: Text) -> Any:
+
+        split_token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+
+        split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids)
+
+        return split_token_ids, split_token_strings
+
+    def _add_lm_specific_special_tokens(
+        self, token_ids: List[List[int]]
+    ) -> List[List[int]]:
+
+        augmented_tokens = [
+            model_special_tokens_pre_processors[self.model_name](example_token_ids)
+            for example_token_ids in token_ids
+        ]
+        return augmented_tokens
+
+    def _post_process_sequence_embeddings(
+        self, sequence_embeddings: np.array
+    ) -> Tuple[np.array, np.array]:
+
+        sentence_embeddings = []
+        post_processed_sequence_embeddings = []
+
+        for example_embedding in sequence_embeddings:
+            (
+                example_sentence_embedding,
+                example_post_processed_embedding,
+            ) = model_embeddings_post_processors[self.model_name](example_embedding)
+
+            sentence_embeddings.append(example_sentence_embedding)
+            post_processed_sequence_embeddings.append(example_post_processed_embedding)
+
+        return (
+            np.array(sentence_embeddings),
+            np.array(post_processed_sequence_embeddings),
+        )
+
+    @staticmethod
+    def _align_tokens(tokens_in: List[Text], token_end: int, token_start: int):
+        """Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer.
+
+        As a language model might split a single word into multiple tokens, we need to make
+        sure that the start and end value of first and last sub-token matches the
+        start and end value of the token return by the WhitespaceTokenizer as the
+        entities are using those start and end values.
+        """
+
+        tokens_out = []
+
+        current_token_offset = token_start
+
+        for index, string in enumerate(tokens_in):
+            if index == 0:
+                if index == len(tokens_in) - 1:
+                    s_token_end = token_end
+                else:
+                    s_token_end = current_token_offset + len(string)
+                tokens_out.append(Token(string, token_start, end=s_token_end))
+            elif index == len(tokens_in) - 1:
+                tokens_out.append(Token(string, current_token_offset, end=token_end))
+            else:
+                tokens_out.append(
+                    Token(
+                        string,
+                        current_token_offset,
+                        end=current_token_offset + len(string),
+                    )
+                )
+
+            current_token_offset += len(string)
+
+        return tokens_out
+
+    def _tokenize_example(self, message: Message, attribute: Text):
+
+        tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)
+
+        tokens_out = []
+
+        token_ids_out = []
+
+        for token in tokens_in:
+            token_start, token_end, token_text = token.start, token.end, token.text
+
+            # use lm specific tokenizer to further tokenize the text
+            split_token_ids, split_token_strings = self._lm_tokenize(token_text)[0]
+
+            token_ids_out += split_token_ids
+
+            _aligned_tokens = self._align_tokens(
+                split_token_strings, token_end, token_start
+            )
+            tokens_out += _aligned_tokens
+
+        return tokens_out, token_ids_out
+
+    def _get_token_ids_for_batch(
+        self, batch_examples: List[Message], attribute: Text
+    ) -> Tuple[List[List[Token]], List[List[int]]]:
+
+        batch_token_ids = []
+        batch_tokens = []
+        for example in batch_examples:
+
+            example_tokens, example_token_ids = self._tokenize_example(
+                example, attribute
+            )
+            batch_tokens.append(example_tokens)
+            batch_token_ids.append(example_token_ids)
+
+        return batch_tokens, batch_token_ids
+
+    @staticmethod
+    def _compute_attention_mask(actual_sequence_lengths):
+
+        attention_mask = []
+        max_seq_length = max(actual_sequence_lengths)
+        for index in range(len(actual_sequence_lengths)):
+            example_seq_length = actual_sequence_lengths[index]
+            attention_mask.append(
+                [1] * example_seq_length + [0] * (max_seq_length - example_seq_length)
+            )
+
+        attention_mask = np.array(attention_mask).astype(np.float32)
+
+        return attention_mask
+
+    def _add_padding_to_batch(self, batch_token_ids):
+        padded_token_ids = []
+        # Compute max length across examples
+        max_seq_len = 0
+        actual_sequence_lengths = []
+        for example_token_ids in batch_token_ids:
+            actual_sequence_lengths.append(len(example_token_ids))
+            max_seq_len = max(max_seq_len, len(example_token_ids))
+        # Add padding according to max_seq_len
+        # Some models don't contain pad token, we use unknown token as padding token.This doesn't affect the computation
+        # since we compute an attention mask anyways.
+        # pad_token_id = self.tokenizer.pad_token_id if self.contains_special_token else self.tokenizer.unk_token_id
+        for example_token_ids in batch_token_ids:
+            padded_token_ids.append(
+                example_token_ids
+                + [self.pad_token_id] * (max_seq_len - len(example_token_ids))
+            )
+        return actual_sequence_lengths, padded_token_ids
+
+    @staticmethod
+    def _extract_nonpadded_embeddings(embeddings, actual_sequence_lengths):
+
+        nonpadded_sequence_embeddings = []
+        for index, embedding in enumerate(embeddings):
+            unmasked_embedding = embedding[: actual_sequence_lengths[index]]
+            nonpadded_sequence_embeddings.append(unmasked_embedding)
+
+        return np.array(nonpadded_sequence_embeddings)
+
+    def _compute_batch_sequence_features(self, batch_attention_mask, padded_token_ids):
+        sequence_hidden_states, pooler_output = self.model(
+            padded_token_ids, attention_mask=batch_attention_mask
+        )
+        sequence_hidden_states = sequence_hidden_states.numpy()
+        return sequence_hidden_states
+
+    def _get_model_features_for_batch(
+        self, batch_token_ids: List[List[int]]
+    ) -> np.array:
+
+        # Let's first add tokenizer specific special tokens to all examples
+        batch_token_ids_augmented = self._add_lm_specific_special_tokens(
+            batch_token_ids
+        )
+
+        # Let's first add padding so that whole batch can be fed to the model
+        actual_sequence_lengths, padded_token_ids = self._add_padding_to_batch(
+            batch_token_ids_augmented
+        )
+
+        # Compute attention mask based on actual_sequence_length
+        batch_attention_mask = self._compute_attention_mask(actual_sequence_lengths)
+
+        # Get token level features from the model
+        sequence_hidden_states = self._compute_batch_sequence_features(
+            batch_attention_mask, padded_token_ids
+        )
+
+        # Extract features for only non-padding tokens
+        sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings(
+            sequence_hidden_states, actual_sequence_lengths
+        )
+
+        # Extract sentence level and post-processed features
+        (
+            sentence_embeddings,
+            sequence_final_embeddings,
+        ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings)
+
+        return sentence_embeddings, sequence_final_embeddings
+
+    def _get_docs_for_batch(
+        self, batch_examples: List[Message], attribute: Text
+    ) -> List[Dict[Text, Any]]:
+
+        batch_tokens, batch_token_ids = self._get_token_ids_for_batch(
+            batch_examples, attribute
+        )
+
+        (
+            batch_sequence_features,
+            batch_sentence_features,
+        ) = self._get_model_features_for_batch(batch_token_ids)
+
+        # A doc consists of {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
+        batch_docs = []
+        for index in range(len(batch_examples)):
+            doc = {
+                "token_ids": batch_token_ids[index],
+                "tokens": batch_tokens[index],
+                "sequence_features": batch_sequence_features[index],
+                "sentence_features": batch_sentence_features[index],
+            }
+            batch_docs.append(doc)
+
+        return batch_docs
+
+    def train(
+        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+    ) -> None:
+
+        batch_size = 64
+
+        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+
+            non_empty_examples = list(
+                filter(lambda x: x.get(attribute), training_data.training_examples)
+            )
+
+            batch_start_index = 0
+
+            while batch_start_index < len(non_empty_examples):
+
+                batch_end_index = min(
+                    batch_start_index + batch_size, len(non_empty_examples)
+                )
+                # Collect batch examples
+                batch_messages = non_empty_examples[batch_start_index:batch_end_index]
+
+                # Construct a doc with relevant features extracted(tokens, dense_features)
+                batch_docs = self._get_docs_for_batch(batch_messages, attribute)
+
+                for index, ex in enumerate(batch_messages):
+                    ex.set(TRANSFORMERS_DOCS[attribute], batch_docs[index])
+
+                batch_start_index += batch_size
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        message.set(
+            TRANSFORMERS_DOCS[TEXT_ATTRIBUTE],
+            self._get_docs_for_batch([message.get(TEXT_ATTRIBUTE)])[0],
+        )
diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
new file mode 100644
index 000000000000..a43b4ba1f386
--- /dev/null
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -0,0 +1,65 @@
+from transformers import (
+    TFBertModel,
+    TFOpenAIGPTModel,
+    TFGPT2Model,
+    TFXLNetModel,
+    TFXLMModel,
+    TFDistilBertModel,
+    TFRobertaModel,
+    BertTokenizer,
+    OpenAIGPTTokenizer,
+    GPT2Tokenizer,
+    XLNetTokenizer,
+    XLMTokenizer,
+    DistilBertTokenizer,
+    RobertaTokenizer,
+)
+from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import *
+
+model_class_dict = {
+    "bert": TFBertModel,
+    "openaigpt": TFOpenAIGPTModel,
+    "gpt2": TFGPT2Model,
+    "xlnet": TFXLNetModel,
+    "xlm": TFXLMModel,
+    "distilbert": TFDistilBertModel,
+    "roberta": TFRobertaModel,
+}
+model_tokenizer_dict = {
+    "bert": BertTokenizer,
+    "openaigpt": OpenAIGPTTokenizer,
+    "gpt2": GPT2Tokenizer,
+    "xlnet": XLNetTokenizer,
+    "xlm": XLMTokenizer,
+    "distilbert": DistilBertTokenizer,
+    "roberta": RobertaTokenizer,
+}
+model_weights_defaults = {
+    "bert": "bert-base-uncased",
+    "openai-gpt": "openai-gpt",
+    "gpt2": "gpt2",
+    "xlnet": "xlnet-base-cased",
+    "xlm": "xlm-mlm-enfr-1024",
+    "distilbert": "distilbert-base-uncased",
+    "roberta": "roberta-base",
+}
+
+model_special_tokens_pre_processors = {
+    "bert": bert_tokens_pre_processor,
+    "openai-gpt": gpt_tokens_pre_processor,
+    "gpt2": gpt_tokens_pre_processor,
+    "xlnet": xlnet_tokens_pre_processor,
+    "xlm": xlm_tokens_pre_processor,
+    "distilbert": bert_tokens_pre_processor,
+    "roberta": roberta_tokens_pre_processor,
+}
+
+model_embeddings_post_processors = {
+    "bert": bert_embeddings_post_process,
+    "openai-gpt": gpt_embeddings_post_process,
+    "gpt2": gpt_embeddings_post_process,
+    "xlnet": xlnet_embeddings_post_process,
+    "xlm": xlm_embeddings_post_process,
+    "distilbert": bert_embeddings_post_process,
+    "roberta": roberta_embeddings_post_process,
+}
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
new file mode 100644
index 000000000000..b4de9856514e
--- /dev/null
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -0,0 +1,122 @@
+from typing import List, Tuple
+import numpy as np
+
+
+def bert_tokens_pre_processor(token_ids: List[int]):
+    """Add BERT style special tokens(CLS and SEP)"""
+
+    BERT_CLS_ID = 101
+    BERT_SEP_ID = 102
+
+    token_ids.insert(0, BERT_CLS_ID)
+    token_ids.append(BERT_SEP_ID)
+
+    return token_ids
+
+
+def gpt_tokens_pre_processor(token_ids: List[int]):
+
+    return token_ids
+
+
+def xlnet_tokens_pre_processor(token_ids: List[int]):
+    """Add XLNET style special tokens"""
+
+    XLNET_CLS_ID = 3
+    XLNET_SEP_ID = 4
+
+    token_ids.append(XLNET_SEP_ID)
+    token_ids.append(XLNET_CLS_ID)
+
+    return token_ids
+
+
+def roberta_tokens_pre_processor(token_ids: List[int]):
+    """Add RoBERTa style special tokens"""
+
+    ROBERTA_BEG_ID = 0
+    ROBERTA_END_ID = 2
+
+    token_ids.insert(0, ROBERTA_BEG_ID)
+    token_ids.append(ROBERTA_END_ID)
+
+    return token_ids
+
+
+def xlm_tokens_pre_processor(token_ids: List[int]):
+    """Add RoBERTa style special tokens"""
+
+    XLM_SEP_ID = 1
+
+    token_ids.insert(0, XLM_SEP_ID)
+    token_ids.append(XLM_SEP_ID)
+
+    return token_ids
+
+
+def bert_embeddings_post_process(
+    sequence_embeddings: np.array,
+) -> Tuple[np.array, np.array]:
+    """Post process embeddings from BERT by removing CLS and SEP embeddings and returning CLS
+
+    token embedding as sentence representation"""
+
+    sentence_embedding = sequence_embeddings[0]
+    post_processed_embedding = sequence_embeddings[1:-1]
+
+    return post_processed_embedding, sentence_embedding
+
+
+def gpt_embeddings_post_process(
+    sequence_embeddings: np.array,
+) -> Tuple[np.array, np.array]:
+    """Post process embeddings from GPT models by taking a mean over sequence embeddings and
+
+    returning that as sentence representation"""
+
+    sentence_embedding = np.mean(sequence_embeddings, axis=0)
+    post_processed_embedding = sequence_embeddings
+
+    return post_processed_embedding, sentence_embedding
+
+
+def xlnet_embeddings_post_process(
+    sequence_embeddings: np.array,
+) -> Tuple[np.array, np.array]:
+    """Post process embeddings from XLNet models by taking a mean over sequence embeddings and
+
+    returning that as sentence representation. Remove last two time steps corresponding to special tokens from the
+    sequence embeddings."""
+
+    post_processed_embedding = sequence_embeddings[:-2]
+    sentence_embedding = np.mean(post_processed_embedding, axis=0)
+
+    return post_processed_embedding, sentence_embedding
+
+
+def roberta_embeddings_post_process(
+    sequence_embeddings: np.array,
+) -> Tuple[np.array, np.array]:
+    """Post process embeddings from Roberta models by taking a mean over sequence embeddings and
+
+    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
+    sequence embeddings."""
+
+    post_processed_embedding = sequence_embeddings[1:-1]
+    sentence_embedding = np.mean(post_processed_embedding, axis=0)
+
+    return post_processed_embedding, sentence_embedding
+
+
+def xlm_embeddings_post_process(
+    sequence_embeddings: np.array,
+) -> Tuple[np.array, np.array]:
+    """Post process embeddings from XLM models by taking a mean over sequence embeddings and
+
+    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
+    sequence embeddings."""
+
+    post_processed_embedding = sequence_embeddings[1:-1]
+    sentence_embedding = np.mean(post_processed_embedding, axis=0)
+
+    return post_processed_embedding, sentence_embedding

From a69c4612fd016b84f0ccebc5e0b08af0512b1105 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 4 Feb 2020 13:23:05 +0100
Subject: [PATCH 288/633] add rel attn parameters constants

---
 rasa/core/policies/ted_policy.py              | 40 +++++++-----
 rasa/nlu/classifiers/diet_classifier.py       | 61 ++++++++++++-------
 .../selectors/embedding_response_selector.py  |  8 +--
 rasa/utils/tensorflow/constants.py            |  9 ++-
 rasa/utils/tensorflow/transformer.py          | 30 ++++-----
 5 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index b98b65fd0ec4..d3553eb0dd2c 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -42,8 +42,8 @@
     NUM_NEG,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    C_EMB,
-    C2,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_SIM_NEG,
     MU_NEG,
@@ -52,6 +52,10 @@
     HIDDEN_LAYERS_SIZES_DIALOGUE,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
+    DROPRATE_ATTENTION,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
 )
 
 
@@ -119,19 +123,27 @@ class TEDPolicy(Policy):
         SCALE_LOSS: True,
         # regularization
         # the scale of L2 regularization
-        C2: 0.001,
+        REGULARIZATION_CONSTANT: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
-        C_EMB: 0.8,
+        NEG_MARGIN_SCALE: 0.8,
         # dropout rate for dial nn
         DROPRATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
         DROPRATE_LABEL: 0.0,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for hold out validation set
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -479,7 +491,7 @@ def _prepare_layers(self) -> None:
             self.config[MU_POS],
             self.config[MU_NEG],
             self.config[USE_MAX_SIM_NEG],
-            self.config[C_EMB],
+            self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
@@ -487,13 +499,13 @@ def _prepare_layers(self) -> None:
         self._tf_layers["ffnn.dialogue"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             layer_name_suffix="dialogue",
         )
         self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE_LABEL],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             layer_name_suffix="label",
         )
         self._tf_layers["transformer"] = TransformerEncoder(
@@ -502,24 +514,24 @@ def _prepare_layers(self) -> None:
             self.config[NUM_HEADS],
             self.config[TRANSFORMER_SIZE] * 4,
             self.config[MAX_SEQ_LENGTH],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             dropout_rate=self.config[DROPRATE_DIALOGUE],
-            attention_dropout_rate=0,
+            attention_dropout_rate=self.config[DROPRATE_ATTENTION],
             unidirectional=True,
-            use_key_relative_position=False,
-            use_value_relative_position=False,
-            max_relative_position=None,
+            use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
+            use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
+            max_relative_position=self.config[MAX_RELATIVE_POSITION],
             name="dialogue_encoder",
         )
         self._tf_layers["embed.dialogue"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "dialogue",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.label"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "label",
             self.config[SIMILARITY_TYPE],
         )
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 6e57a966e6a6..45ffe42a8bad 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -61,14 +61,18 @@
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
-    C_EMB,
-    C2,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_SIM_NEG,
     MU_NEG,
     MU_POS,
     EMBED_DIM,
     BILOU_FLAG,
+    DROPRATE_ATTENTION,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
 )
 
 
@@ -160,12 +164,14 @@ class DIETClassifier(EntityExtractor):
         SCALE_LOSS: True,
         # regularization parameters
         # the scale of L2 regularization
-        C2: 0.002,
+        REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different labels
-        C_EMB: 0.8,
-        # dropout rate for rnn
+        NEG_MARGIN_SCALE: 0.8,
+        # dropout rate for encoder
         DROPRATE: 0.2,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # visualization of accuracy
@@ -185,6 +191,12 @@ class DIETClassifier(EntityExtractor):
         SPARSE_INPUT_DROPOUT: True,
         # if true BILOU schema is used for entities
         BILOU_FLAG: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -929,25 +941,25 @@ def _prepare_sequence_layers(self) -> None:
         self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
             self.data_signature["text_features"],
             "text",
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             self.config[DENSE_DIM]["text"],
         )
         self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
             self.data_signature["label_features"],
             "label",
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             self.config[DENSE_DIM]["label"],
         )
         self._tf_layers["ffnn.text"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_TEXT],
             self.config[DROPRATE],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
         )
         self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
         )
         self._tf_layers["transformer"] = (
@@ -957,13 +969,13 @@ def _prepare_sequence_layers(self) -> None:
                 self.config[NUM_HEADS],
                 self.config[TRANSFORMER_SIZE] * 4,
                 self.config[MAX_SEQ_LENGTH],
-                self.config[C2],
+                self.config[REGULARIZATION_CONSTANT],
                 dropout_rate=self.config[DROPRATE],
-                attention_dropout_rate=0,
+                attention_dropout_rate=self.config[DROPRATE_ATTENTION],
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
-                use_key_relative_position=False,
-                use_value_relative_position=False,
-                max_relative_position=None,
+                use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
+                use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
+                max_relative_position=self.config[MAX_RELATIVE_POSITION],
                 name="text_encoder",
             )
             if self.config[NUM_TRANSFORMER_LAYERS] > 0
@@ -974,13 +986,13 @@ def _prepare_mask_lm_layers(self) -> None:
         self._tf_layers["input_mask"] = layers.InputMask()
         self._tf_layers["embed.lm_mask"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "lm_mask",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.golden_token"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "golden_token",
             self.config[SIMILARITY_TYPE],
         )
@@ -990,7 +1002,7 @@ def _prepare_mask_lm_layers(self) -> None:
             self.config[MU_POS],
             self.config[MU_NEG],
             self.config[USE_MAX_SIM_NEG],
-            self.config[C_EMB],
+            self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
@@ -999,13 +1011,13 @@ def _prepare_mask_lm_layers(self) -> None:
     def _prepare_intent_classification_layers(self) -> None:
         self._tf_layers["embed.text"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "text",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.label"] = layers.Embed(
             self.config[EMBED_DIM],
-            self.config[C2],
+            self.config[REGULARIZATION_CONSTANT],
             "label",
             self.config[SIMILARITY_TYPE],
         )
@@ -1015,7 +1027,7 @@ def _prepare_intent_classification_layers(self) -> None:
             self.config[MU_POS],
             self.config[MU_NEG],
             self.config[USE_MAX_SIM_NEG],
-            self.config[C_EMB],
+            self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
@@ -1023,9 +1035,11 @@ def _prepare_intent_classification_layers(self) -> None:
 
     def _prepare_entity_recognition_layers(self) -> None:
         self._tf_layers["embed.logits"] = layers.Embed(
-            self._num_tags, self.config[C2], "logits"
+            self._num_tags, self.config[REGULARIZATION_CONSTANT], "logits"
+        )
+        self._tf_layers["crf"] = layers.CRF(
+            self._num_tags, self.config[REGULARIZATION_CONSTANT]
         )
-        self._tf_layers["crf"] = layers.CRF(self._num_tags, self.config[C2])
         self._tf_layers["crf_f1_score"] = tfa.metrics.F1Score(
             num_classes=self._num_tags - 1,  # `0` prediction is not a prediction
             average="micro",
@@ -1066,7 +1080,8 @@ def _create_bow(
     ) -> tf.Tensor:
 
         x = self._combine_sparse_dense_features(features, mask, name, sparse_dropout)
-        return self._tf_layers[f"ffnn.{name}"](tf.reduce_sum(x, 1), self._training)
+        x = tf.reduce_sum(x, axis=1)
+        return self._tf_layers[f"ffnn.{name}"](x, self._training)
 
     def _create_sequence(
         self,
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 40c073966466..f08f70f36125 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -31,8 +31,8 @@
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
-    C_EMB,
-    C2,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_SIM_NEG,
     MU_NEG,
@@ -143,10 +143,10 @@ class ResponseSelector(DIETClassifier):
         SCALE_LOSS: True,
         # regularization parameters
         # the scale of L2 regularization
-        C2: 0.002,
+        REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different intent labels
-        C_EMB: 0.8,
+        NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index b7b3c68c2a5a..312f97481001 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -30,9 +30,10 @@
 USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
 
 SCALE_LOSS = "scale_loss"
-C2 = "l2_regularization"
-C_EMB = "c_emb"
+REGULARIZATION_CONSTANT = "regularization_constant"
+NEG_MARGIN_SCALE = "neg_margin_scale"
 DROPRATE = "droprate"
+DROPRATE_ATTENTION = "droprate_attention"
 DROPRATE_DIALOGUE = "droprate_dialogue"
 DROPRATE_LABEL = "droprate_label"
 
@@ -48,3 +49,7 @@
 RANKING_LENGTH = "ranking_length"
 
 BILOU_FLAG = "BILOU_flag"
+
+KEY_RELATIVE_ATTENTION = "use_key_relative_attention"
+VALUE_RELATIVE_ATTENTION = "use_value_relative_attention"
+MAX_RELATIVE_POSITION = "max_relative_position"
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 4a32f7047dbd..d9bd09b70d7f 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -28,7 +28,9 @@ def __init__(
         self.unidirectional = unidirectional
         self.use_key_relative_position = use_key_relative_position
         self.use_value_relative_position = use_value_relative_position
-        self.max_relative_position = max_relative_position
+        self.relative_length = max_relative_position
+        if self.relative_length is not None:
+            self.relative_length += 1  # include current time
         self.heads_share_relative_embedding = heads_share_relative_embedding
 
         assert d_model % self.num_heads == 0
@@ -41,29 +43,29 @@ def __init__(
 
         self._dense = DenseWithSparseWeights(units=d_model)
 
-        self._add_relative_embeddings()
+        self._create_relative_embeddings()
 
-    def _add_relative_embeddings(self) -> None:
-        """Instantiate relative embeddings."""
+    def _create_relative_embeddings(self) -> None:
+        """Create relative embeddings."""
 
         if self.use_key_relative_position or self.use_value_relative_position:
-            if not self.max_relative_position:
+            if not self.relative_length:
                 raise ValueError(
-                    f"Max relative position {self.max_relative_position} "
+                    f"Max relative position {self.relative_length} "
                     f"should be > 0 when using relative attention."
                 )
 
             if self.unidirectional:
-                max_relative_position_unmasked = self.max_relative_position
+                relative_length = self.relative_length
             else:
-                max_relative_position_unmasked = 2 * self.max_relative_position - 1
+                relative_length = 2 * self.relative_length - 1
 
             if self.heads_share_relative_embedding:
-                relative_embedding_shape = (max_relative_position_unmasked, self._depth)
+                relative_embedding_shape = (relative_length, self._depth)
             else:
                 relative_embedding_shape = (
                     self.num_heads,
-                    max_relative_position_unmasked,
+                    relative_length,
                     self._depth,
                 )
         else:
@@ -86,14 +88,14 @@ def _add_relative_embeddings(self) -> None:
     def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
         # pad the left side to length
         pad_left = x[:, :, :, :1, :]
-        pad_left = tf.tile(pad_left, (1, 1, 1, length - self.max_relative_position, 1))
+        pad_left = tf.tile(pad_left, (1, 1, 1, length - self.relative_length, 1))
 
         # pad the right side to length
         if self.unidirectional:
             m_right = 1  # current time
             pad_right = tf.zeros_like(x[:, :, :, -1:, :])
         else:
-            m_right = self.max_relative_position
+            m_right = self.relative_length
             pad_right = x[:, :, :, -1:, :]
         pad_right = tf.tile(pad_right, (1, 1, 1, length - m_right, 1))
 
@@ -106,7 +108,7 @@ def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tens
             pad_right = tf.tile(pad_right, (1, 1, 1, length - 1, 1))
             x = tf.concat([x, pad_right], axis=-2)
 
-        dl = self.max_relative_position - length
+        dl = self.relative_length - length
         m = tf.shape(x)[-2]
         return x[:, :, :, dl : m - dl, :]
 
@@ -132,7 +134,7 @@ def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor:
         depth = tf.shape(x)[-1]
 
         x = tf.cond(
-            length > self.max_relative_position,
+            length > self.relative_length,
             lambda: self._pad_relative_embeddings(x, length),
             lambda: self._slice_relative_embeddings(x, length),
         )

From d5a1b85ccba12d7f7332dfbaad8133c3b63c2b3d Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 15:51:14 +0100
Subject: [PATCH 289/633] tested all available models. implementation works

---
 rasa/nlu/registry.py                          |  4 +--
 .../nlu/utils/hugging_face/hf_transformers.py | 34 +++++++++++++------
 rasa/nlu/utils/hugging_face/registry.py       | 26 +++++++-------
 .../transformers_pre_post_processors.py       | 10 +++---
 requirements.txt                              |  1 +
 setup.py                                      |  1 +
 6 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index ad9ba1944809..994d998a843c 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -40,7 +40,7 @@
 from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
-from rasa.nlu.utils.hugging_face.hf_transformers import HuggingFaceTransformers
+from rasa.nlu.utils.hugging_face.hf_transformers import HuggingFaceTransformersNLP
 from rasa.utils.common import class_from_module_path, raise_warning
 
 if typing.TYPE_CHECKING:
@@ -56,7 +56,7 @@
     # utils
     SpacyNLP,
     MitieNLP,
-    HuggingFaceTransformers,
+    HuggingFaceTransformersNLP,
     # tokenizers
     MitieTokenizer,
     SpacyTokenizer,
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 3f641c7a63b7..075d7b53540c 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -29,7 +29,7 @@
 )
 
 
-class HuggingFaceTransformers(Component):
+class HuggingFaceTransformersNLP(Component):
     provides = [
         TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
@@ -43,14 +43,13 @@ class HuggingFaceTransformers(Component):
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
-        super(HuggingFaceTransformers, self).__init__(component_config)
+        super(HuggingFaceTransformersNLP, self).__init__(component_config)
 
         self._load_model()
         self.whitespace_tokenizer = WhitespaceTokenizer()
 
     def _load_model(self) -> None:
         """Try loading the model"""
-        # import transformers
 
         self.model_name = self.component_config["model_name"]
 
@@ -58,7 +57,9 @@ def _load_model(self) -> None:
             logger.error(
                 f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}"
             )
-            raise
+            raise KeyError(
+                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}"
+            )
 
         self.model_weights = self.component_config["model_weights"]
 
@@ -76,8 +77,11 @@ def _load_model(self) -> None:
             self.model_weights
         )
 
-        # TODO
-        self.pad_token = ...
+        # Use a universal pad token since all transformer architectures do not have a consistent token.
+        # Instead of pad_token_id we use unk_token_id because pad_token_id is not set for all architectures.
+        # We can't add a new token as well since vocabulary resizing is not yet supported for TF classes.
+        # Also, this does not hurt the model predictions since we use an attention mask while feeding input.
+        self.pad_token_id = self.tokenizer.unk_token_id
 
     @classmethod
     def required_packages(cls) -> List[Text]:
@@ -170,7 +174,7 @@ def _tokenize_example(self, message: Message, attribute: Text):
             token_start, token_end, token_text = token.start, token.end, token.text
 
             # use lm specific tokenizer to further tokenize the text
-            split_token_ids, split_token_strings = self._lm_tokenize(token_text)[0]
+            split_token_ids, split_token_strings = self._lm_tokenize(token_text)
 
             token_ids_out += split_token_ids
 
@@ -242,9 +246,15 @@ def _extract_nonpadded_embeddings(embeddings, actual_sequence_lengths):
         return np.array(nonpadded_sequence_embeddings)
 
     def _compute_batch_sequence_features(self, batch_attention_mask, padded_token_ids):
-        sequence_hidden_states, pooler_output = self.model(
-            padded_token_ids, attention_mask=batch_attention_mask
+
+        print(np.array(padded_token_ids).shape, np.array(batch_attention_mask).shape)
+        model_outputs = self.model(
+            np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
         )
+        sequence_hidden_states = model_outputs[
+            0
+        ]  # sequence hidden states is always the first output from all models
+
         sequence_hidden_states = sequence_hidden_states.numpy()
         return sequence_hidden_states
 
@@ -303,7 +313,9 @@ def _get_docs_for_batch(
                 "token_ids": batch_token_ids[index],
                 "tokens": batch_tokens[index],
                 "sequence_features": batch_sequence_features[index],
-                "sentence_features": batch_sentence_features[index],
+                "sentence_features": np.reshape(
+                    batch_sentence_features[index], (1, -1)
+                ),
             }
             batch_docs.append(doc)
 
@@ -343,5 +355,5 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         message.set(
             TRANSFORMERS_DOCS[TEXT_ATTRIBUTE],
-            self._get_docs_for_batch([message.get(TEXT_ATTRIBUTE)])[0],
+            self._get_docs_for_batch([message], attribute=TEXT_ATTRIBUTE)[0],
         )
diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
index a43b4ba1f386..40a491c0fc09 100644
--- a/rasa/nlu/utils/hugging_face/registry.py
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -21,7 +21,7 @@
     "openaigpt": TFOpenAIGPTModel,
     "gpt2": TFGPT2Model,
     "xlnet": TFXLNetModel,
-    "xlm": TFXLMModel,
+    # "xlm": TFXLMModel, # Currently doesn't work because of a bug in transformers library https://github.com/huggingface/transformers/issues/2729
     "distilbert": TFDistilBertModel,
     "roberta": TFRobertaModel,
 }
@@ -30,36 +30,36 @@
     "openaigpt": OpenAIGPTTokenizer,
     "gpt2": GPT2Tokenizer,
     "xlnet": XLNetTokenizer,
-    "xlm": XLMTokenizer,
+    # "xlm": XLMTokenizer,
     "distilbert": DistilBertTokenizer,
     "roberta": RobertaTokenizer,
 }
 model_weights_defaults = {
     "bert": "bert-base-uncased",
-    "openai-gpt": "openai-gpt",
+    "openaigpt": "openai-gpt",
     "gpt2": "gpt2",
     "xlnet": "xlnet-base-cased",
-    "xlm": "xlm-mlm-enfr-1024",
+    # "xlm": "xlm-mlm-enfr-1024",
     "distilbert": "distilbert-base-uncased",
     "roberta": "roberta-base",
 }
 
 model_special_tokens_pre_processors = {
     "bert": bert_tokens_pre_processor,
-    "openai-gpt": gpt_tokens_pre_processor,
+    "openaigpt": gpt_tokens_pre_processor,
     "gpt2": gpt_tokens_pre_processor,
     "xlnet": xlnet_tokens_pre_processor,
-    "xlm": xlm_tokens_pre_processor,
+    # "xlm": xlm_tokens_pre_processor,
     "distilbert": bert_tokens_pre_processor,
     "roberta": roberta_tokens_pre_processor,
 }
 
 model_embeddings_post_processors = {
-    "bert": bert_embeddings_post_process,
-    "openai-gpt": gpt_embeddings_post_process,
-    "gpt2": gpt_embeddings_post_process,
-    "xlnet": xlnet_embeddings_post_process,
-    "xlm": xlm_embeddings_post_process,
-    "distilbert": bert_embeddings_post_process,
-    "roberta": roberta_embeddings_post_process,
+    "bert": bert_embeddings_post_processor,
+    "openaigpt": gpt_embeddings_post_processor,
+    "gpt2": gpt_embeddings_post_processor,
+    "xlnet": xlnet_embeddings_post_processor,
+    # "xlm": xlm_embeddings_post_processor,
+    "distilbert": bert_embeddings_post_processor,
+    "roberta": roberta_embeddings_post_processor,
 }
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index b4de9856514e..c893ca4b6798 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -54,7 +54,7 @@ def xlm_tokens_pre_processor(token_ids: List[int]):
     return token_ids
 
 
-def bert_embeddings_post_process(
+def bert_embeddings_post_processor(
     sequence_embeddings: np.array,
 ) -> Tuple[np.array, np.array]:
     """Post process embeddings from BERT by removing CLS and SEP embeddings and returning CLS
@@ -67,7 +67,7 @@ def bert_embeddings_post_process(
     return post_processed_embedding, sentence_embedding
 
 
-def gpt_embeddings_post_process(
+def gpt_embeddings_post_processor(
     sequence_embeddings: np.array,
 ) -> Tuple[np.array, np.array]:
     """Post process embeddings from GPT models by taking a mean over sequence embeddings and
@@ -80,7 +80,7 @@ def gpt_embeddings_post_process(
     return post_processed_embedding, sentence_embedding
 
 
-def xlnet_embeddings_post_process(
+def xlnet_embeddings_post_processor(
     sequence_embeddings: np.array,
 ) -> Tuple[np.array, np.array]:
     """Post process embeddings from XLNet models by taking a mean over sequence embeddings and
@@ -94,7 +94,7 @@ def xlnet_embeddings_post_process(
     return post_processed_embedding, sentence_embedding
 
 
-def roberta_embeddings_post_process(
+def roberta_embeddings_post_processor(
     sequence_embeddings: np.array,
 ) -> Tuple[np.array, np.array]:
     """Post process embeddings from Roberta models by taking a mean over sequence embeddings and
@@ -108,7 +108,7 @@ def roberta_embeddings_post_process(
     return post_processed_embedding, sentence_embedding
 
 
-def xlm_embeddings_post_process(
+def xlm_embeddings_post_processor(
     sequence_embeddings: np.array,
 ) -> Tuple[np.array, np.array]:
     """Post process embeddings from XLM models by taking a mean over sequence embeddings and
diff --git a/requirements.txt b/requirements.txt
index f58d5afa0ddb..53803ac9534b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -63,3 +63,4 @@ tensorflow==2.1.0
 tensorflow_hub==0.7.0
 tensorflow-addons==0.7.0
 tensorflow-probability==0.7.0
+transformers==2.3.0
diff --git a/setup.py b/setup.py
index 264e63bf31ce..c698b60ad701 100644
--- a/setup.py
+++ b/setup.py
@@ -83,6 +83,7 @@
     "SQLAlchemy~=1.3.0",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
+    "transformers~=2.3.0",
 ]
 
 extras_requires = {

From 205c7bd63c5dd8fcaaab09821b8c84ac03218f1e Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 15:53:44 +0100
Subject: [PATCH 290/633] refactored class name

---
 rasa/nlu/registry.py                           | 4 ++--
 rasa/nlu/utils/hugging_face/hf_transformers.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 994d998a843c..a8e3f05878a3 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -40,7 +40,7 @@
 from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
-from rasa.nlu.utils.hugging_face.hf_transformers import HuggingFaceTransformersNLP
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.utils.common import class_from_module_path, raise_warning
 
 if typing.TYPE_CHECKING:
@@ -56,7 +56,7 @@
     # utils
     SpacyNLP,
     MitieNLP,
-    HuggingFaceTransformersNLP,
+    HFTransformersNLP,
     # tokenizers
     MitieTokenizer,
     SpacyTokenizer,
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 075d7b53540c..b4060d5d7870 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -29,7 +29,7 @@
 )
 
 
-class HuggingFaceTransformersNLP(Component):
+class HFTransformersNLP(Component):
     provides = [
         TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
@@ -43,7 +43,7 @@ class HuggingFaceTransformersNLP(Component):
 
     def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
-        super(HuggingFaceTransformersNLP, self).__init__(component_config)
+        super(HFTransformersNLP, self).__init__(component_config)
 
         self._load_model()
         self.whitespace_tokenizer = WhitespaceTokenizer()

From 7eb475caa540e66509a5b924908ec691a06b8593 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 15:59:01 +0100
Subject: [PATCH 291/633] remove print statement

---
 rasa/nlu/utils/hugging_face/hf_transformers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index b4060d5d7870..172418e3335d 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -247,7 +247,6 @@ def _extract_nonpadded_embeddings(embeddings, actual_sequence_lengths):
 
     def _compute_batch_sequence_features(self, batch_attention_mask, padded_token_ids):
 
-        print(np.array(padded_token_ids).shape, np.array(batch_attention_mask).shape)
         model_outputs = self.model(
             np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
         )

From 0bfaaa85a1d229623f72a983b5a6bca7d84edce9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 16:34:28 +0100
Subject: [PATCH 292/633] update tests

---
 rasa/core/policies/embedding_policy.py       |  2 +-
 rasa/core/policies/ted_policy.py             |  2 +-
 tests/nlu/base/test_config.py                |  4 ++--
 tests/nlu/base/test_evaluation.py            | 14 +++++++-------
 tests/nlu/extractors/test_entity_synonyms.py |  2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index b120a0745b8e..5c650c976fd4 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -41,7 +41,7 @@
 class EmbeddingPolicy(TEDPolicy):
     """Transformer Embedding Dialogue Policy (TEDP)
 
-    Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707
+    The policy used in our paper https://arxiv.org/abs/1910.00486
     """
 
     SUPPORTS_ONLINE_TRAINING = True
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 63cca332b124..dce22bd80109 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -59,7 +59,7 @@
 class TEDPolicy(Policy):
     """Transformer Embedding Dialogue Policy (TEDP)
 
-    Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707
+    The policy used in our paper https://arxiv.org/abs/1910.00486
     """
 
     SUPPORTS_ONLINE_TRAINING = True
diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py
index a937d0dbf067..2daa0e3c9c17 100644
--- a/tests/nlu/base/test_config.py
+++ b/tests/nlu/base/test_config.py
@@ -63,10 +63,10 @@ def test_default_config_file():
 
 def test_set_attr_on_component():
     cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")
-    cfg.set_component_attr(6, C=324)
+    cfg.set_component_attr(7, C=324)
 
     assert cfg.for_component(1) == {"name": "SpacyTokenizer"}
-    assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324}
+    assert cfg.for_component(7) == {"name": "SklearnIntentClassifier", "C": 324}
 
 
 def test_override_defaults_supervised_embeddings_pipeline():
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 2c5a76a8c217..9b9abce08779 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -274,7 +274,7 @@ def test_run_evaluation(unpacked_trained_moodbot_path):
         data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=False
     )
     assert result.get("intent_evaluation")
-    assert result.get("entity_evaluation").get("CRFEntityExtractor")
+    assert result.get("entity_evaluation").get("DIETClassifier")
 
 
 def test_run_cv_evaluation():
@@ -292,12 +292,12 @@ def test_run_cv_evaluation():
     assert len(intent_results.test["Accuracy"]) == n_folds
     assert len(intent_results.test["Precision"]) == n_folds
     assert len(intent_results.test["F1-score"]) == n_folds
-    assert len(entity_results.train["CRFEntityExtractor"]["Accuracy"]) == n_folds
-    assert len(entity_results.train["CRFEntityExtractor"]["Precision"]) == n_folds
-    assert len(entity_results.train["CRFEntityExtractor"]["F1-score"]) == n_folds
-    assert len(entity_results.test["CRFEntityExtractor"]["Accuracy"]) == n_folds
-    assert len(entity_results.test["CRFEntityExtractor"]["Precision"]) == n_folds
-    assert len(entity_results.test["CRFEntityExtractor"]["F1-score"]) == n_folds
+    assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds
+    assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds
+    assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds
+    assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds
 
 
 def test_run_cv_evaluation_with_response_selector():
diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py
index a6479b8f8b43..eb9e5a332e2e 100644
--- a/tests/nlu/extractors/test_entity_synonyms.py
+++ b/tests/nlu/extractors/test_entity_synonyms.py
@@ -4,7 +4,7 @@
 
 def test_unintentional_synonyms_capitalized(component_builder):
     _config = utilities.base_test_conf("pretrained_embeddings_spacy")
-    ner_syn = component_builder.create_component(_config.for_component(5), _config)
+    ner_syn = component_builder.create_component(_config.for_component(6), _config)
     examples = [
         Message(
             "Any Mexican restaurant will do",

From cc55dfc61483402b17dcc5d4cc7853cbd1fe7151 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 17:23:50 +0100
Subject: [PATCH 293/633] Apply suggestions from code review

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 rasa/nlu/utils/hugging_face/hf_transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 172418e3335d..8c70f88e33b5 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -69,7 +69,7 @@ def _load_model(self) -> None:
             )
             self.model_weights = model_weights_defaults[self.model_name]
 
-        logger.info("Loading Tokenizer and Model for {}".format(self.model_name))
+        logger.info(f"Loading Tokenizer and Model for {self.model_name}")
         self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
             self.model_weights
         )

From 6779189194c1088256732550cec90695f7ff9b5d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 4 Feb 2020 17:38:23 +0100
Subject: [PATCH 294/633] review comments

---
 rasa/nlu/classifiers/diet_classifier.py       | 58 ++++++++++++-------
 .../embedding_intent_classifier.py            |  4 +-
 rasa/nlu/extractors/crf_entity_extractor.py   |  6 +-
 rasa/nlu/registry.py                          |  4 +-
 .../selectors/embedding_response_selector.py  |  4 +-
 rasa/utils/tensorflow/constants.py            |  2 +-
 rasa/utils/tensorflow/tf_model_data.py        | 17 +++---
 7 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c43de8254c99..c030abba06c4 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -54,7 +54,7 @@
     SPARSE_INPUT_DROPOUT,
     MASKED_LM,
     ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
+    LABEL_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
@@ -173,7 +173,7 @@ class DIETClassifier(EntityExtractor):
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
         # model config
         # if true intent classification is trained and intent predicted
-        INTENT_CLASSIFICATION: True,
+        LABEL_CLASSIFICATION: True,
         # if true named entity recognition is trained and entities predicted
         ENTITY_RECOGNITION: True,
         # if true random tokens of the input message will be masked and the model
@@ -192,7 +192,7 @@ def _check_config_parameters(self) -> None:
             self.component_config
         )
 
-        if self.component_config[INTENT_CLASSIFICATION]:
+        if self.component_config[LABEL_CLASSIFICATION]:
             if (
                 self.component_config[SHARE_HIDDEN_LAYERS]
                 and self.component_config[HIDDEN_LAYERS_SIZES_TEXT]
@@ -247,7 +247,7 @@ def __init__(
         self.data_example = None
 
         self.label_key = (
-            "label_ids" if self.component_config[INTENT_CLASSIFICATION] else "tag_ids"
+            "label_ids" if self.component_config[LABEL_CLASSIFICATION] else "tag_ids"
         )
 
     # training data helpers:
@@ -531,7 +531,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
         label_attribute = (
-            INTENT_ATTRIBUTE if self.component_config[INTENT_CLASSIFICATION] else None
+            INTENT_ATTRIBUTE if self.component_config[LABEL_CLASSIFICATION] else None
         )
 
         model_data = self._create_model_data(
@@ -566,7 +566,7 @@ def train(
 
         model_data = self.preprocess_train_data(training_data)
 
-        if self.component_config[INTENT_CLASSIFICATION]:
+        if self.component_config[LABEL_CLASSIFICATION]:
             possible_to_train = self._check_enough_labels(model_data)
 
             if not possible_to_train:
@@ -726,7 +726,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         out = self._predict(message)
 
-        if self.component_config[INTENT_CLASSIFICATION]:
+        if self.component_config[LABEL_CLASSIFICATION]:
             label, label_ranking = self._predict_label(out)
 
             message.set("intent", label, add_to_output=True)
@@ -856,7 +856,7 @@ def _load_model(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else "tag_ids"
+        label_key = "label_ids" if meta[LABEL_CLASSIFICATION] else "tag_ids"
         model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
         model = DIET.load(
@@ -917,6 +917,23 @@ def __init__(
 
         self.all_labels_embed = None  # needed for efficient prediction
 
+        self._check_data()
+
+    def _check_data(self):
+        if "text_features" not in self.data_signature:
+            raise ValueError(
+                "No text features specified. Cannot train 'DIETClassifier'."
+            )
+        if (
+            self.config[LABEL_CLASSIFICATION]
+            and "label_features" not in self.data_signature
+        ):
+            raise ValueError(
+                "No label features specified. Cannot train 'DIETClassifier'."
+            )
+        if self.config[ENTITY_RECOGNITION] and "tag_ids" not in self.data_signature:
+            raise ValueError("No tag ids present. Cannot train 'DIETClassifier'.")
+
     def _create_metrics(self):
         # self.metrics preserve order
         # output losses first
@@ -931,7 +948,7 @@ def _create_metrics(self):
     def _update_metrics_to_log(self) -> None:
         if self.config[MASKED_LM]:
             self.metrics_to_log += ["m_loss", "m_acc"]
-        if self.config[INTENT_CLASSIFICATION]:
+        if self.config[LABEL_CLASSIFICATION]:
             self.metrics_to_log += ["i_loss", "i_acc"]
         if self.config[ENTITY_RECOGNITION]:
             self.metrics_to_log += ["e_loss", "e_f1"]
@@ -940,7 +957,7 @@ def _prepare_layers(self) -> None:
         self._prepare_sequence_layers()
         if self.config[MASKED_LM]:
             self._prepare_mask_lm_layers()
-        if self.config[INTENT_CLASSIFICATION]:
+        if self.config[LABEL_CLASSIFICATION]:
             self._prepare_intent_classification_layers()
         if self.config[ENTITY_RECOGNITION]:
             self._prepare_entity_recognition_layers()
@@ -971,14 +988,13 @@ def _prepare_sequence_layers(self) -> None:
         self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
-        if "text_features" in self.data_signature:
-            self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
-                self.data_signature["text_features"],
-                "text",
-                self.config[C2],
-                self.config[DENSE_DIM]["text"],
-            )
-        if "label_features" in self.data_signature:
+        self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
+            self.data_signature["text_features"],
+            "text",
+            self.config[C2],
+            self.config[DENSE_DIM]["text"],
+        )
+        if self.config[LABEL_CLASSIFICATION]:
             self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
                 self.data_signature["label_features"],
                 "label",
@@ -991,7 +1007,7 @@ def _prepare_sequence_layers(self) -> None:
             self.config[C2],
             "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
         )
-        if self.config[INTENT_CLASSIFICATION]:
+        if self.config[LABEL_CLASSIFICATION]:
             self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
                 self.config[HIDDEN_LAYERS_SIZES_LABEL],
                 self.config[DROPRATE],
@@ -1228,7 +1244,7 @@ def batch_loss(
             self.mask_acc.update_state(acc)
             losses.append(loss)
 
-        if self.config[INTENT_CLASSIFICATION]:
+        if self.config[LABEL_CLASSIFICATION]:
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
 
@@ -1267,7 +1283,7 @@ def batch_predict(
         )
 
         out = {}
-        if self.config[INTENT_CLASSIFICATION]:
+        if self.config[LABEL_CLASSIFICATION]:
             if self.all_labels_embed is None:
                 _, self.all_labels_embed = self._create_all_labels()
 
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 3071a0ab04dc..8ac3da8eca80 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -28,7 +28,7 @@
     SPARSE_INPUT_DROPOUT,
     MASKED_LM,
     ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
+    LABEL_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     DROPRATE,
@@ -133,7 +133,7 @@ def __init__(
         component_config = component_config or {}
 
         # the following properties are fixed for the EmbeddingIntentClassifier
-        component_config[INTENT_CLASSIFICATION] = True
+        component_config[LABEL_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 9b4da1960813..8bb2c0cbfc2e 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -32,7 +32,7 @@
     SPARSE_INPUT_DROPOUT,
     MASKED_LM,
     ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
+    LABEL_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
@@ -126,7 +126,7 @@ def __init__(
         component_config = component_config or {}
 
         # the following properties are fixed for the CRFEntityExtractor
-        component_config[INTENT_CLASSIFICATION] = False
+        component_config[LABEL_CLASSIFICATION] = False
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
         component_config[NUM_TRANSFORMER_LAYERS] = 0
@@ -207,7 +207,7 @@ def load(
             data_example,
         ) = cls._load_from_files(meta, model_dir)
 
-        meta[INTENT_CLASSIFICATION] = False
+        meta[LABEL_CLASSIFICATION] = False
         meta[ENTITY_RECOGNITION] = True
         meta[MASKED_LM] = False
         meta[NUM_TRANSFORMER_LAYERS] = 0
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index eba0bd50afdd..5aad39dbc266 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -41,7 +41,7 @@
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.utils.common import class_from_module_path, raise_warning
 from rasa.utils.tensorflow.constants import (
-    INTENT_CLASSIFICATION,
+    LABEL_CLASSIFICATION,
     ENTITY_RECOGNITION,
     NUM_TRANSFORMER_LAYERS,
 )
@@ -127,7 +127,7 @@
         {"name": "LexicalSyntacticFeaturizer"},
         {
             "name": "DIETClassifier",
-            INTENT_CLASSIFICATION: False,
+            LABEL_CLASSIFICATION: False,
             ENTITY_RECOGNITION: True,
             NUM_TRANSFORMER_LAYERS: 0,
         },
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 1b20938306a7..f4e8e5b7efb5 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -25,7 +25,7 @@
     SPARSE_INPUT_DROPOUT,
     MASKED_LM,
     ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
+    LABEL_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
@@ -173,7 +173,7 @@ def __init__(
         component_config = component_config or {}
 
         # the following properties are fixed for the ResponseSelector
-        component_config[INTENT_CLASSIFICATION] = True
+        component_config[LABEL_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
         component_config[BILOU_FLAG] = False
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index bbc959acca92..1a6852ff8f02 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -38,7 +38,7 @@
 EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
 EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
 
-INTENT_CLASSIFICATION = "intent_classification"
+LABEL_CLASSIFICATION = "label_classification"
 ENTITY_RECOGNITION = "entity_recognition"
 MASKED_LM = "use_masked_language_model"
 
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index ffd804dc1e79..bb4df7240428 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -23,7 +23,7 @@ class FeatureSignature(NamedTuple):
 class RasaModelData:
     def __init__(self, label_key: Optional[Text] = None, data: Data = None):
         self.data = data or {}
-        self.label_key = label_key or ""
+        self.label_key = label_key
         # will be updated when features are added
         self.num_examples = self.get_number_of_examples()
 
@@ -80,7 +80,7 @@ def split(
     ) -> Tuple["RasaModelData", "RasaModelData"]:
         """Create random hold out test set using stratified split."""
 
-        self._check_label_key(self.label_key)
+        self._check_label_key()
 
         label_ids = self._create_label_ids(self.data[self.label_key][0])
         label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
@@ -168,11 +168,10 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
         that more populated classes should appear more often.
         """
 
-        if self.label_key not in data or len(data[self.label_key]) > 1:
-            raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
+        self._check_label_key()
 
         # skip balancing if labels are token based
-        if data[self.label_key][0][0].size > 1:
+        if self.label_key is None or data[self.label_key][0][0].size > 1:
             logger.debug(
                 f"Skip balancing data for '{self.label_key}' as data is a sequence."
             )
@@ -412,9 +411,11 @@ def _split_by_label_ids(
             )
         return label_data
 
-    def _check_label_key(self, label_key: Text):
-        if label_key not in self.data or len(self.data[label_key]) > 1:
-            raise ValueError(f"Key '{label_key}' not in RasaModelData.")
+    def _check_label_key(self):
+        if self.label_key is not None and (
+            self.label_key not in self.data or len(self.data[self.label_key]) > 1
+        ):
+            raise ValueError(f"Key '{self.label_key}' not in RasaModelData.")
 
     def _convert_train_test_split(
         self, output_values: List[Any], solo_values: List[Any]

From 576d8f4388109bf4d53d9cc8eb4397a742f8b33f Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 17:41:37 +0100
Subject: [PATCH 295/633] quick review comments. Tests WIP

---
 rasa/nlu/constants.py                           | 11 +++++++----
 .../dense_featurizer/lm_featurizer.py           |  6 +++---
 rasa/nlu/tokenizers/lm_tokenizer.py             |  6 +++---
 rasa/nlu/utils/hugging_face/hf_transformers.py  | 17 +++++++----------
 rasa/nlu/utils/hugging_face/registry.py         | 15 ++++++++++++---
 5 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index b22cd0d4ecf6..b118df0ca15c 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -35,10 +35,13 @@
     RESPONSE_ATTRIBUTE: "response_dense_features",
 }
 
-SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"}
-TRANSFORMERS_DOCS = {
-    TEXT_ATTRIBUTE: "text_transformers_doc",
-    RESPONSE_ATTRIBUTE: "response_transformers_doc",
+SPACY_DOCS = {
+    TEXT_ATTRIBUTE: "text_spacy_doc",
+    RESPONSE_ATTRIBUTE: "response_spacy_doc",
+}
+HF_TRANSFORMERS_DOCS = {
+    TEXT_ATTRIBUTE: "text_hf_transformers_doc",
+    RESPONSE_ATTRIBUTE: "response_hf_transformers_doc",
 }
 
 DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 02a0091d8595..4a74a898018b 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -11,7 +11,7 @@
 
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
-    TRANSFORMERS_DOCS,
+    HF_TRANSFORMERS_DOCS,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
     TOKENS_NAMES,
@@ -25,7 +25,7 @@ class LanguageModelFeaturizer(Featurizer):
     ]
 
     requires = [
-        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
     def train(
@@ -41,7 +41,7 @@ def train(
 
     def get_doc(self, message: Message, attribute: Text) -> Any:
 
-        return message.get(TRANSFORMERS_DOCS[attribute])
+        return message.get(HF_TRANSFORMERS_DOCS[attribute])
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index cd951fdb3602..fca7f2f563ce 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -6,7 +6,7 @@
 
 from rasa.nlu.constants import (
     TOKENS_NAMES,
-    TRANSFORMERS_DOCS,
+    HF_TRANSFORMERS_DOCS,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
 
@@ -16,7 +16,7 @@ class LanguageModelTokenizer(Tokenizer):
     provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
     requires = [
-        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
     defaults = {
@@ -27,7 +27,7 @@ class LanguageModelTokenizer(Tokenizer):
     }
 
     def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
-        return message.get(TRANSFORMERS_DOCS[attribute])
+        return message.get(HF_TRANSFORMERS_DOCS[attribute])
 
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 172418e3335d..1a05e23309fd 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -19,19 +19,16 @@
 
 logger = logging.getLogger(__name__)
 
-if typing.TYPE_CHECKING:
-    from transformers import *
-
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
-    TRANSFORMERS_DOCS,
+    HF_TRANSFORMERS_DOCS,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
 
 
 class HFTransformersNLP(Component):
     provides = [
-        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
     defaults = {
@@ -250,9 +247,9 @@ def _compute_batch_sequence_features(self, batch_attention_mask, padded_token_id
         model_outputs = self.model(
             np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
         )
-        sequence_hidden_states = model_outputs[
-            0
-        ]  # sequence hidden states is always the first output from all models
+
+        # sequence hidden states is always the first output from all models
+        sequence_hidden_states = model_outputs[0]
 
         sequence_hidden_states = sequence_hidden_states.numpy()
         return sequence_hidden_states
@@ -346,13 +343,13 @@ def train(
                 batch_docs = self._get_docs_for_batch(batch_messages, attribute)
 
                 for index, ex in enumerate(batch_messages):
-                    ex.set(TRANSFORMERS_DOCS[attribute], batch_docs[index])
+                    ex.set(HF_TRANSFORMERS_DOCS[attribute], batch_docs[index])
 
                 batch_start_index += batch_size
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
         message.set(
-            TRANSFORMERS_DOCS[TEXT_ATTRIBUTE],
+            HF_TRANSFORMERS_DOCS[TEXT_ATTRIBUTE],
             self._get_docs_for_batch([message], attribute=TEXT_ATTRIBUTE)[0],
         )
diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
index 40a491c0fc09..3cb771dafcd4 100644
--- a/rasa/nlu/utils/hugging_face/registry.py
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -3,18 +3,27 @@
     TFOpenAIGPTModel,
     TFGPT2Model,
     TFXLNetModel,
-    TFXLMModel,
+    # TFXLMModel,
     TFDistilBertModel,
     TFRobertaModel,
     BertTokenizer,
     OpenAIGPTTokenizer,
     GPT2Tokenizer,
     XLNetTokenizer,
-    XLMTokenizer,
+    # XLMTokenizer,
     DistilBertTokenizer,
     RobertaTokenizer,
 )
-from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import *
+from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import (
+    bert_tokens_pre_processor,
+    gpt_tokens_pre_processor,
+    xlnet_tokens_pre_processor,
+    roberta_tokens_pre_processor,
+    bert_embeddings_post_processor,
+    gpt_embeddings_post_processor,
+    xlnet_embeddings_post_processor,
+    roberta_embeddings_post_processor,
+)
 
 model_class_dict = {
     "bert": TFBertModel,

From 4c3f218c16348c463b59ad7859f77ce032d66a3b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Tue, 4 Feb 2020 17:43:17 +0100
Subject: [PATCH 296/633] fix imports

---
 rasa/nlu/utils/hugging_face/hf_transformers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 2b8b3aae22fa..a77e9fc76d39 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -66,7 +66,7 @@ def _load_model(self) -> None:
             )
             self.model_weights = model_weights_defaults[self.model_name]
 
-        logger.info(f"Loading Tokenizer and Model for {self.model_name}")
+        logger.debug(f"Loading Tokenizer and Model for {self.model_name}")
         self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
             self.model_weights
         )
@@ -224,7 +224,6 @@ def _add_padding_to_batch(self, batch_token_ids):
         # Add padding according to max_seq_len
         # Some models don't contain pad token, we use unknown token as padding token.This doesn't affect the computation
         # since we compute an attention mask anyways.
-        # pad_token_id = self.tokenizer.pad_token_id if self.contains_special_token else self.tokenizer.unk_token_id
         for example_token_ids in batch_token_ids:
             padded_token_ids.append(
                 example_token_ids

From 49e9a157486db6727e281d728631e7bedb149152 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 01:54:12 +0100
Subject: [PATCH 297/633] bug fix to swap seq and sentence embeddings

---
 rasa/nlu/constants.py                         |   6 +-
 .../dense_featurizer/lm_featurizer.py         |  18 ++-
 rasa/nlu/tokenizers/lm_tokenizer.py           |   7 +-
 .../nlu/utils/hugging_face/hf_transformers.py |  18 ++-
 .../transformers_pre_post_processors.py       |  18 +--
 tests/nlu/tokenizers/test_lm_tokenizer.py     | 117 ++++++++++++++++++
 6 files changed, 153 insertions(+), 31 deletions(-)
 create mode 100644 tests/nlu/tokenizers/test_lm_tokenizer.py

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index b118df0ca15c..324a874977b6 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -39,9 +39,9 @@
     TEXT_ATTRIBUTE: "text_spacy_doc",
     RESPONSE_ATTRIBUTE: "response_spacy_doc",
 }
-HF_TRANSFORMERS_DOCS = {
-    TEXT_ATTRIBUTE: "text_hf_transformers_doc",
-    RESPONSE_ATTRIBUTE: "response_hf_transformers_doc",
+LANGUAGE_MODEL_DOCS = {
+    TEXT_ATTRIBUTE: "text_language_model_doc",
+    RESPONSE_ATTRIBUTE: "response_language_model_doc",
 }
 
 DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 4a74a898018b..a6b911e0e61b 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -1,17 +1,13 @@
 import numpy as np
-import typing
 from typing import Any, Optional, Text
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.training_data import Message, TrainingData
 
-if typing.TYPE_CHECKING:
-    from spacy.tokens import Doc
-
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
-    HF_TRANSFORMERS_DOCS,
+    LANGUAGE_MODEL_DOCS,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
     TOKENS_NAMES,
@@ -25,7 +21,7 @@ class LanguageModelFeaturizer(Featurizer):
     ]
 
     requires = [
-        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
     def train(
@@ -41,7 +37,7 @@ def train(
 
     def get_doc(self, message: Message, attribute: Text) -> Any:
 
-        return message.get(HF_TRANSFORMERS_DOCS[attribute])
+        return message.get(LANGUAGE_MODEL_DOCS[attribute])
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
@@ -50,11 +46,11 @@ def process(self, message: Message, **kwargs: Any) -> None:
     def _set_lm_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
         """Adds the precomputed word vectors to the messages features."""
 
-        message_attribute_doc = self.get_doc(message, attribute)
+        doc = self.get_doc(message, attribute)
 
-        if message_attribute_doc is not None:
-            sequence_features = message_attribute_doc["sequence_features"]
-            sentence_features = message_attribute_doc["sentence_features"]
+        if doc is not None:
+            sequence_features = doc["sequence_features"]
+            sentence_features = doc["sentence_features"]
 
             features = np.concatenate([sequence_features, sentence_features])
 
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index fca7f2f563ce..5501b1d268ae 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -1,4 +1,3 @@
-import typing
 from typing import Text, List, Any, Dict
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
@@ -6,7 +5,7 @@
 
 from rasa.nlu.constants import (
     TOKENS_NAMES,
-    HF_TRANSFORMERS_DOCS,
+    LANGUAGE_MODEL_DOCS,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
 
@@ -16,7 +15,7 @@ class LanguageModelTokenizer(Tokenizer):
     provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
     requires = [
-        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
     defaults = {
@@ -27,7 +26,7 @@ class LanguageModelTokenizer(Tokenizer):
     }
 
     def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
-        return message.get(HF_TRANSFORMERS_DOCS[attribute])
+        return message.get(LANGUAGE_MODEL_DOCS[attribute])
 
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index a77e9fc76d39..7b5df1860e98 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -21,14 +21,14 @@
 
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
-    HF_TRANSFORMERS_DOCS,
+    LANGUAGE_MODEL_DOCS,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
 
 
 class HFTransformersNLP(Component):
     provides = [
-        HF_TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
     defaults = {
@@ -297,8 +297,8 @@ def _get_docs_for_batch(
         )
 
         (
-            batch_sequence_features,
             batch_sentence_features,
+            batch_sequence_features,
         ) = self._get_model_features_for_batch(batch_token_ids)
 
         # A doc consists of {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
@@ -322,6 +322,8 @@ def train(
 
         batch_size = 64
 
+        all_docs = []
+
         for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
 
             non_empty_examples = list(
@@ -342,13 +344,19 @@ def train(
                 batch_docs = self._get_docs_for_batch(batch_messages, attribute)
 
                 for index, ex in enumerate(batch_messages):
-                    ex.set(HF_TRANSFORMERS_DOCS[attribute], batch_docs[index])
+
+                    ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index])
+                    all_docs.append((ex.get(TEXT_ATTRIBUTE), batch_docs[index]))
 
                 batch_start_index += batch_size
 
+        # import pickle
+        # with open('inside_rasa_scaffold.pkl','wb') as f:
+        #     pickle.dump(all_docs, f)
+
     def process(self, message: Message, **kwargs: Any) -> None:
 
         message.set(
-            HF_TRANSFORMERS_DOCS[TEXT_ATTRIBUTE],
+            LANGUAGE_MODEL_DOCS[TEXT_ATTRIBUTE],
             self._get_docs_for_batch([message], attribute=TEXT_ATTRIBUTE)[0],
         )
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index c893ca4b6798..072d9237d539 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -8,10 +8,12 @@ def bert_tokens_pre_processor(token_ids: List[int]):
     BERT_CLS_ID = 101
     BERT_SEP_ID = 102
 
-    token_ids.insert(0, BERT_CLS_ID)
-    token_ids.append(BERT_SEP_ID)
+    processed_tokens = token_ids
 
-    return token_ids
+    processed_tokens.insert(0, BERT_CLS_ID)
+    processed_tokens.append(BERT_SEP_ID)
+
+    return processed_tokens
 
 
 def gpt_tokens_pre_processor(token_ids: List[int]):
@@ -64,7 +66,7 @@ def bert_embeddings_post_processor(
     sentence_embedding = sequence_embeddings[0]
     post_processed_embedding = sequence_embeddings[1:-1]
 
-    return post_processed_embedding, sentence_embedding
+    return sentence_embedding, post_processed_embedding
 
 
 def gpt_embeddings_post_processor(
@@ -77,7 +79,7 @@ def gpt_embeddings_post_processor(
     sentence_embedding = np.mean(sequence_embeddings, axis=0)
     post_processed_embedding = sequence_embeddings
 
-    return post_processed_embedding, sentence_embedding
+    return sentence_embedding, post_processed_embedding
 
 
 def xlnet_embeddings_post_processor(
@@ -91,7 +93,7 @@ def xlnet_embeddings_post_processor(
     post_processed_embedding = sequence_embeddings[:-2]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
-    return post_processed_embedding, sentence_embedding
+    return sentence_embedding, post_processed_embedding
 
 
 def roberta_embeddings_post_processor(
@@ -105,7 +107,7 @@ def roberta_embeddings_post_processor(
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
-    return post_processed_embedding, sentence_embedding
+    return sentence_embedding, post_processed_embedding
 
 
 def xlm_embeddings_post_processor(
@@ -119,4 +121,4 @@ def xlm_embeddings_post_processor(
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
-    return post_processed_embedding, sentence_embedding
+    return sentence_embedding, post_processed_embedding
diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py
new file mode 100644
index 000000000000..b1f89a00ab9e
--- /dev/null
+++ b/tests/nlu/tokenizers/test_lm_tokenizer.py
@@ -0,0 +1,117 @@
+import pytest
+
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES
+from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
+
+
+@pytest.mark.parametrize(
+    "model_name, texts, expected_tokens, expected_indices",
+    [
+        (
+            "bert",
+            [
+                "Good evening.",
+                "hello",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "ńöñàśçií",
+                "leaving ",
+            ],
+            [
+                ["good", "evening"],
+                ["hello"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                ["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"],
+            ],
+            [
+                [(0, 4), (5, 12)],
+                [(0, 5)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)],
+            ],
+        ),
+        #         (
+        #             "openaigpt",
+        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
+        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
+        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
+        #         ),
+        #         (
+        #             "gpt2",
+        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
+        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
+        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
+        #         ),
+        #         (
+        #             "xlnet",
+        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
+        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
+        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
+        #         ),
+        #         (
+        #             "distilbert",
+        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
+        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
+        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
+        #         ),
+        # (
+        #             "roberta",
+        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
+        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
+        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
+        #         ),
+    ],
+)
+def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices):
+
+    print("model name", model_name)
+    transformers_config = {"model_name": model_name}
+
+    transformers_nlp = HFTransformersNLP(transformers_config)
+    lm_tokenizer = LanguageModelTokenizer()
+
+    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
+        message = Message.build(text=text)
+        transformers_nlp.process(message)
+        tokens = lm_tokenizer.tokenize(message, TEXT_ATTRIBUTE)
+
+        print(text)
+        print([t.text for t in tokens])
+        print([(t.start, t.end) for t in tokens])
+        print("-----------------------------------")
+
+        # assert [t.text for t in tokens] == expected_tokens
+        # assert [t.start for t in tokens] == [i[0] for i in expected_indices]
+        # assert [t.end for t in tokens] == [i[1] for i in expected_indices]
+
+    print("=================================")
+    assert True == False
+
+
+# @pytest.mark.parametrize(
+#     "text, expected_tokens",
+#     [
+#         ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]),
+#         ("Forecast for LUNCH", ["Forecast for LUNCH"]),
+#     ],
+# )
+# def test_custom_intent_symbol(text, expected_tokens):
+#     component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
+#
+#     tk = ConveRTTokenizer(component_config)
+#
+#     message = Message(text)
+#     message.set(INTENT_ATTRIBUTE, text)
+#
+#     tk.train(TrainingData([message]))
+#
+#     assert [
+#         t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
+#     ] == expected_tokens

From e990e8fc10ed8f2f2e44944c492b13370e35a5e0 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 13:22:09 +0100
Subject: [PATCH 298/633] tests for tokenizers are in

---
 .../nlu/utils/hugging_face/hf_transformers.py |  21 +-
 rasa/nlu/utils/hugging_face/registry.py       |  15 +
 .../transformers_pre_post_processors.py       |  30 +-
 tests/nlu/tokenizers/test_lm_tokenizer.py     | 361 ++++++++++++++----
 4 files changed, 348 insertions(+), 79 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 7b5df1860e98..8c1377c6fac7 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -1,6 +1,5 @@
 import logging
-import typing
-from typing import Any, Dict, List, Text, Tuple
+from typing import Any, Dict, List, Text, Tuple, Optional
 
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.components import Component
@@ -15,6 +14,7 @@
     model_weights_defaults,
     model_special_tokens_pre_processors,
     model_embeddings_post_processors,
+    model_tokens_cleaners,
 )
 
 logger = logging.getLogger(__name__)
@@ -102,6 +102,9 @@ def _add_lm_specific_special_tokens(
         ]
         return augmented_tokens
 
+    def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
+        return model_tokens_cleaners[self.model_name](token_strings)
+
     def _post_process_sequence_embeddings(
         self, sequence_embeddings: np.array
     ) -> Tuple[np.array, np.array]:
@@ -173,6 +176,8 @@ def _tokenize_example(self, message: Message, attribute: Text):
             # use lm specific tokenizer to further tokenize the text
             split_token_ids, split_token_strings = self._lm_tokenize(token_text)
 
+            split_token_strings = self._lm_specific_token_cleanup(split_token_strings)
+
             token_ids_out += split_token_ids
 
             _aligned_tokens = self._align_tokens(
@@ -317,13 +322,14 @@ def _get_docs_for_batch(
         return batch_docs
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         batch_size = 64
 
-        all_docs = []
-
         for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
 
             non_empty_examples = list(
@@ -346,14 +352,9 @@ def train(
                 for index, ex in enumerate(batch_messages):
 
                     ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index])
-                    all_docs.append((ex.get(TEXT_ATTRIBUTE), batch_docs[index]))
 
                 batch_start_index += batch_size
 
-        # import pickle
-        # with open('inside_rasa_scaffold.pkl','wb') as f:
-        #     pickle.dump(all_docs, f)
-
     def process(self, message: Message, **kwargs: Any) -> None:
 
         message.set(
diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
index 3cb771dafcd4..e1f6dfc6f8e9 100644
--- a/rasa/nlu/utils/hugging_face/registry.py
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -23,8 +23,13 @@
     gpt_embeddings_post_processor,
     xlnet_embeddings_post_processor,
     roberta_embeddings_post_processor,
+    bert_tokens_cleaner,
+    openaigpt_tokens_cleaner,
+    gpt2_tokens_cleaner,
+    xlnet_tokens_cleaner,
 )
 
+
 model_class_dict = {
     "bert": TFBertModel,
     "openaigpt": TFOpenAIGPTModel,
@@ -63,6 +68,16 @@
     "roberta": roberta_tokens_pre_processor,
 }
 
+model_tokens_cleaners = {
+    "bert": bert_tokens_cleaner,
+    "openaigpt": openaigpt_tokens_cleaner,
+    "gpt2": gpt2_tokens_cleaner,
+    "xlnet": xlnet_tokens_cleaner,
+    # "xlm": xlm_tokens_pre_processor,
+    "distilbert": bert_tokens_cleaner,  # uses the same as BERT
+    "roberta": gpt2_tokens_cleaner,  # Uses the same as GPT2
+}
+
 model_embeddings_post_processors = {
     "bert": bert_embeddings_post_processor,
     "openaigpt": gpt_embeddings_post_processor,
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index 072d9237d539..854e8601aa59 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Tuple, Text
 import numpy as np
 
 
@@ -122,3 +122,31 @@ def xlm_embeddings_post_processor(
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
     return sentence_embedding, post_processed_embedding
+
+
+def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
+    """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token into sub-tokens"""
+
+    tokens = [string.replace("##", "") for string in token_strings]
+    return [string for string in tokens if string]
+
+
+def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
+    """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a token into sub-tokens"""
+
+    tokens = [string.replace("</w>", "") for string in token_strings]
+    return [string for string in tokens if string]
+
+
+def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
+    """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token into sub-tokens"""
+
+    tokens = [string.replace("Ġ", "") for string in token_strings]
+    return [string for string in tokens if string]
+
+
+def xlnet_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
+    """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token into sub-tokens"""
+
+    tokens = [string.replace("▁", "") for string in token_strings]
+    return [string for string in tokens if string]
diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py
index b1f89a00ab9e..4f2baee8f9ac 100644
--- a/tests/nlu/tokenizers/test_lm_tokenizer.py
+++ b/tests/nlu/tokenizers/test_lm_tokenizer.py
@@ -11,14 +11,61 @@
     [
         (
             "bert",
+            [
+                "Good evening.",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "here is the sentence I want embeddings for.",
+            ],
+            [
+                ["good", "evening"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                [
+                    "here",
+                    "is",
+                    "the",
+                    "sentence",
+                    "i",
+                    "want",
+                    "em",
+                    "bed",
+                    "ding",
+                    "s",
+                    "for",
+                ],
+            ],
+            [
+                [(0, 4), (5, 12)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 30),
+                    (30, 33),
+                    (33, 37),
+                    (37, 38),
+                    (39, 42),
+                ],
+            ],
+        ),
+        (
+            "openaigpt",
             [
                 "Good evening.",
                 "hello",
                 "you're",
                 "r. n. b.",
                 "rock & roll",
-                "ńöñàśçií",
-                "leaving ",
+                "here is the sentence I want embeddings for.",
             ],
             [
                 ["good", "evening"],
@@ -26,7 +73,106 @@
                 ["you", "re"],
                 ["r", "n", "b"],
                 ["rock", "&", "roll"],
-                ["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"],
+                ["here", "is", "the", "sentence", "i", "want", "embe", "ddings", "for"],
+            ],
+            [
+                [(0, 4), (5, 12)],
+                [(0, 5)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 32),
+                    (32, 38),
+                    (39, 42),
+                ],
+            ],
+        ),
+        (
+            "gpt2",
+            [
+                "Good evening.",
+                "hello",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "here is the sentence I want embeddings for.",
+            ],
+            [
+                ["Good", "even", "ing"],
+                ["hello"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                [
+                    "here",
+                    "is",
+                    "the",
+                    "sent",
+                    "ence",
+                    "I",
+                    "want",
+                    "embed",
+                    "d",
+                    "ings",
+                    "for",
+                ],
+            ],
+            [
+                [(0, 4), (5, 9), (9, 12)],
+                [(0, 5)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 16),
+                    (16, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 33),
+                    (33, 34),
+                    (34, 38),
+                    (39, 42),
+                ],
+            ],
+        ),
+        (
+            "xlnet",
+            [
+                "Good evening.",
+                "hello",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "here is the sentence I want embeddings for.",
+            ],
+            [
+                ["Good", "evening"],
+                ["hello"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                [
+                    "here",
+                    "is",
+                    "the",
+                    "sentence",
+                    "I",
+                    "want",
+                    "embed",
+                    "ding",
+                    "s",
+                    "for",
+                ],
             ],
             [
                 [(0, 4), (5, 12)],
@@ -34,84 +180,163 @@
                 [(0, 3), (4, 6)],
                 [(0, 1), (3, 4), (6, 7)],
                 [(0, 4), (5, 6), (7, 11)],
-                [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 33),
+                    (33, 37),
+                    (37, 38),
+                    (39, 42),
+                ],
+            ],
+        ),
+        (
+            "distilbert",
+            [
+                "Good evening.",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "here is the sentence I want embeddings for.",
+            ],
+            [
+                ["good", "evening"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                [
+                    "here",
+                    "is",
+                    "the",
+                    "sentence",
+                    "i",
+                    "want",
+                    "em",
+                    "bed",
+                    "ding",
+                    "s",
+                    "for",
+                ],
+            ],
+            [
+                [(0, 4), (5, 12)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 30),
+                    (30, 33),
+                    (33, 37),
+                    (37, 38),
+                    (39, 42),
+                ],
+            ],
+        ),
+        (
+            "roberta",
+            [
+                "Good evening.",
+                "hello",
+                "you're",
+                "r. n. b.",
+                "rock & roll",
+                "here is the sentence I want embeddings for.",
+            ],
+            [
+                ["Good", "even", "ing"],
+                ["hello"],
+                ["you", "re"],
+                ["r", "n", "b"],
+                ["rock", "&", "roll"],
+                [
+                    "here",
+                    "is",
+                    "the",
+                    "sent",
+                    "ence",
+                    "I",
+                    "want",
+                    "embed",
+                    "d",
+                    "ings",
+                    "for",
+                ],
+            ],
+            [
+                [(0, 4), (5, 9), (9, 12)],
+                [(0, 5)],
+                [(0, 3), (4, 6)],
+                [(0, 1), (3, 4), (6, 7)],
+                [(0, 4), (5, 6), (7, 11)],
+                [
+                    (0, 4),
+                    (5, 7),
+                    (8, 11),
+                    (12, 16),
+                    (16, 20),
+                    (21, 22),
+                    (23, 27),
+                    (28, 33),
+                    (33, 34),
+                    (34, 38),
+                    (39, 42),
+                ],
             ],
         ),
-        #         (
-        #             "openaigpt",
-        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
-        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
-        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
-        #         ),
-        #         (
-        #             "gpt2",
-        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
-        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
-        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
-        #         ),
-        #         (
-        #             "xlnet",
-        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
-        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
-        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
-        #         ),
-        #         (
-        #             "distilbert",
-        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
-        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
-        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
-        #         ),
-        # (
-        #             "roberta",
-        #             ["Good evening.","hello", "you're", "r. n. b.", "rock & roll", "ńöñàśçií"],
-        #             [["good", "evening"],["hello"],["you", "re"],["r", "n", "b"],["rock", "&", "roll"],["ń", "ö", "ñ", "à", "ś", "ç", "i", "í"]],
-        #             [[(0, 8), (9, 12), (13, 18)],[(0, 5)],[(0, 3), (4, 6)],[(0, 1), (3, 4), (6, 7)],[(0, 4), (5, 6), (7, 11)],[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8)]],
-        #         ),
     ],
 )
 def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices):
 
-    print("model name", model_name)
     transformers_config = {"model_name": model_name}
 
     transformers_nlp = HFTransformersNLP(transformers_config)
     lm_tokenizer = LanguageModelTokenizer()
 
     for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
+
         message = Message.build(text=text)
         transformers_nlp.process(message)
         tokens = lm_tokenizer.tokenize(message, TEXT_ATTRIBUTE)
 
-        print(text)
-        print([t.text for t in tokens])
-        print([(t.start, t.end) for t in tokens])
-        print("-----------------------------------")
-
-        # assert [t.text for t in tokens] == expected_tokens
-        # assert [t.start for t in tokens] == [i[0] for i in expected_indices]
-        # assert [t.end for t in tokens] == [i[1] for i in expected_indices]
-
-    print("=================================")
-    assert True == False
-
-
-# @pytest.mark.parametrize(
-#     "text, expected_tokens",
-#     [
-#         ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]),
-#         ("Forecast for LUNCH", ["Forecast for LUNCH"]),
-#     ],
-# )
-# def test_custom_intent_symbol(text, expected_tokens):
-#     component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
-#
-#     tk = ConveRTTokenizer(component_config)
-#
-#     message = Message(text)
-#     message.set(INTENT_ATTRIBUTE, text)
-#
-#     tk.train(TrainingData([message]))
-#
-#     assert [
-#         t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-#     ] == expected_tokens
+        assert [t.text for t in tokens] == gt_tokens
+        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
+        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
+
+
+@pytest.mark.parametrize(
+    "text, expected_tokens",
+    [
+        ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]),
+        ("Forecast for LUNCH", ["Forecast for LUNCH"]),
+        ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]),
+    ],
+)
+def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
+    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
+
+    transformers_config = {"model_name": "bert"}  # Test for one should be enough
+
+    transformers_nlp = HFTransformersNLP(transformers_config)
+    lm_tokenizer = LanguageModelTokenizer(component_config)
+
+    message = Message(text)
+    message.set(INTENT_ATTRIBUTE, text)
+
+    td = TrainingData([message])
+
+    transformers_nlp.train(td)
+    lm_tokenizer.train(td)
+
+    assert [
+        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
+    ] == expected_tokens

From 99bad36f2fc0fbe97a5bd6f662faa125ec50adc0 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 15:06:30 +0100
Subject: [PATCH 299/633] added featurizer tests

---
 .../dense_featurizer/lm_featurizer.py         |   4 +-
 tests/nlu/featurizers/test_lm_featurizer.py   | 227 ++++++++++++++++++
 2 files changed, 229 insertions(+), 2 deletions(-)
 create mode 100644 tests/nlu/featurizers/test_lm_featurizer.py

diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index a6b911e0e61b..4fb3d7feff02 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -27,11 +27,11 @@ class LanguageModelFeaturizer(Featurizer):
     def train(
         self,
         training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig],
+        config: Optional[RasaNLUModelConfig] = None,
         **kwargs: Any,
     ) -> None:
 
-        for example in training_data.intent_examples:
+        for example in training_data.training_examples:
             for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                 self._set_lm_features(example, attribute)
 
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
new file mode 100644
index 000000000000..3fd85894fcb5
--- /dev/null
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -0,0 +1,227 @@
+import numpy as np
+import pytest
+
+from rasa.nlu.training_data import TrainingData
+from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
+from rasa.nlu.constants import (
+    TEXT_ATTRIBUTE,
+    DENSE_FEATURE_NAMES,
+    TOKENS_NAMES,
+    RESPONSE_ATTRIBUTE,
+    INTENT_ATTRIBUTE,
+    LANGUAGE_MODEL_DOCS,
+)
+from rasa.nlu.training_data import Message
+
+
+@pytest.mark.parametrize(
+    "model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec",
+    [
+        (
+            "bert",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(3, 768), (12, 768)],
+            [
+                [0.5727445, -0.16078179],
+                [-0.5485125, 0.09632876, -0.4278888, 0.11438395, 0.18316492],
+            ],
+            [
+                [0.068804, 0.32802248, -0.11250398, -0.11338018, -0.37116352],
+                [0.05909364, 0.06433402, 0.08569086, -0.16530034, -0.11396906],
+            ],
+        ),
+        (
+            "openaigpt",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(3, 768), (10, 768)],
+            [
+                [-0.0630323737859726, 0.4029877185821533],
+                [
+                    0.8072432279586792,
+                    -0.08990508317947388,
+                    0.9985930919647217,
+                    -0.38779014348983765,
+                    0.08921952545642853,
+                ],
+            ],
+            [
+                [
+                    0.16997766494750977,
+                    0.1493849903345108,
+                    0.39421725273132324,
+                    -0.5753618478775024,
+                    0.05096133053302765,
+                ],
+                [
+                    0.41056010127067566,
+                    -0.1169343888759613,
+                    -0.3019704818725586,
+                    -0.40207183361053467,
+                    0.6289798021316528,
+                ],
+            ],
+        ),
+        (
+            "gpt2",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(4, 768), (12, 768)],
+            [
+                [-0.033827826380729675, -0.10971662402153015, 0.002244209870696068],
+                [
+                    -0.18434514105319977,
+                    -0.5386468768119812,
+                    -0.11122681945562363,
+                    -1.368929147720337,
+                    -0.5397579669952393,
+                ],
+            ],
+            [
+                [
+                    -0.04710008203983307,
+                    -0.2793063223361969,
+                    -0.23804056644439697,
+                    -0.3212292492389679,
+                    0.11430201679468155,
+                ],
+                [
+                    -0.1809544414281845,
+                    -0.017152192071080208,
+                    -0.3176477551460266,
+                    -0.008387327194213867,
+                    0.3365338146686554,
+                ],
+            ],
+        ),
+        (
+            "xlnet",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(3, 768), (11, 768)],
+            [
+                [1.7612367868423462, 2.5819129943847656],
+                [
+                    0.784195065498352,
+                    0.7068007588386536,
+                    1.5883606672286987,
+                    1.891886591911316,
+                    2.5209126472473145,
+                ],
+            ],
+            [
+                [
+                    2.171574831008911,
+                    -1.5377449989318848,
+                    -3.2671749591827393,
+                    0.22520869970321655,
+                    -1.598855972290039,
+                ],
+                [
+                    1.6516317129135132,
+                    0.021670114248991013,
+                    -2.5114030838012695,
+                    1.447351098060608,
+                    -2.5866634845733643,
+                ],
+            ],
+        ),
+        (
+            "distilbert",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(3, 768), (12, 768)],
+            [
+                [0.22866562008857727, -0.0575055330991745],
+                [
+                    -0.6448041796684265,
+                    -0.5105321407318115,
+                    -0.4892978072166443,
+                    0.17531153559684753,
+                    0.22717803716659546,
+                ],
+            ],
+            [
+                [
+                    -0.09814466536045074,
+                    -0.07325993478298187,
+                    0.22358475625514984,
+                    -0.20274735987186432,
+                    -0.07363069802522659,
+                ],
+                [
+                    -0.146609365940094,
+                    -0.07373693585395813,
+                    0.016850866377353668,
+                    -0.2407529354095459,
+                    -0.0979844480752945,
+                ],
+            ],
+        ),
+        (
+            "roberta",
+            ["Good evening.", "here is the sentence I want embeddings for."],
+            [(4, 768), (12, 768)],
+            [
+                [-0.309267520904541, 0.12365783751010895, 0.06769893318414688],
+                [
+                    0.02152823843061924,
+                    -0.08026768267154694,
+                    -0.10808645188808441,
+                    0.20090824365615845,
+                    0.04756045714020729,
+                ],
+            ],
+            [
+                [
+                    -0.03930358216166496,
+                    0.034788478165864944,
+                    0.12246038764715195,
+                    0.08401528000831604,
+                    0.7026961445808411,
+                ],
+                [
+                    -0.018586941063404083,
+                    -0.09835464507341385,
+                    0.03242188319563866,
+                    0.09366855770349503,
+                    0.4458026587963104,
+                ],
+            ],
+        ),
+    ],
+)
+def test_lm_featurizer_shape_values(
+    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec
+):
+    transformers_config = {"model_name": model_name}
+
+    transformers_nlp = HFTransformersNLP(transformers_config)
+    lm_featurizer = LanguageModelFeaturizer()
+
+    messages = []
+    for text in texts:
+        messages.append(Message.build(text=text))
+    td = TrainingData(messages)
+
+    transformers_nlp.train(td)
+    lm_featurizer.train(td)
+
+    for index in range(len(texts)):
+
+        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+        computed_sequence_vec, computed_sentence_vec = (
+            computed_feature_vec[:-1],
+            computed_feature_vec[-1],
+        )
+
+        assert computed_feature_vec.shape == expected_shape[index]
+
+        # Look at the value of first dimension for a few starting timesteps
+        assert np.allclose(
+            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
+            expected_sequence_vec[index],
+            atol=1e-5,
+        )
+
+        # Look at the first value of first five dimensions
+        assert np.allclose(
+            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
+        )

From 01c7de5efbcc3f71c929e16c37445abcb9e2ff0b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 17:32:12 +0100
Subject: [PATCH 300/633] added documentation

---
 docs/nlu/components.rst                     | 105 ++++++++++++++++++++
 rasa/nlu/utils/hugging_face/registry.py     |  12 +--
 tests/nlu/featurizers/test_lm_featurizer.py |   6 +-
 tests/nlu/tokenizers/test_lm_tokenizer.py   |   2 +-
 4 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c5f6c2a8a060..973c635ebba6 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -82,6 +82,54 @@ SpacyNLP
           # between these two words, therefore setting this to `true`.
           case_sensitive: false
 
+
+.. _HFTransformersNLP:
+
+HFTransformersNLP
+~~~~~~~~~~~~~~~~~
+
+:Short: Transformers based pre-trained language model initializer
+:Outputs: nothing
+:Requires: nothing
+:Description:
+    Initializes specified pre-trained language model from HuggingFace's `Transformers library
+    <https://huggingface.co/transformers/>`__.  The component applies language model specific tokenization and featurization
+    to compute sequence and sentence level representations for each example in the training data.
+    Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
+    component for downstream NLU models.
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+          - name: HFTransformersNLP
+
+            # Name of the language model to use
+            model_name: "bert"
+
+            # Shortcut name to specify architecture variation of the above model. Full list of supported architectures
+            # can be found at https://huggingface.co/transformers/pretrained_models.html . If left empty, it uses the
+            # default model architecture that original transformers library loads
+            model_weights: "bert-base-uncased"
+
+        #    +----------------+--------------+-------------------------+
+        #    | Language Model | Parameter    | Default value for       |
+        #    |                | "model_name" | "model_weights"         |
+        #    +----------------+--------------+-------------------------+
+        #    | BERT           | bert         | bert-base-uncased       |
+        #    +----------------+--------------+-------------------------+
+        #    | GPT            | gpt          | openai-gpt              |
+        #    +----------------+--------------+-------------------------+
+        #    | GPT-2          | gpt2         | gpt2                    |
+        #    +----------------+--------------+-------------------------+
+        #    | XLNet          | xlnet        | xlnet-base-cased        |
+        #    +----------------+--------------+-------------------------+
+        #    | DistilBERT     | distilbert   | distilbert-base-uncased |
+        #    +----------------+--------------+-------------------------+
+        #    | RoBERTa        | roberta      | roberta-base            |
+        #    +----------------+--------------+-------------------------+
+
+
 Text Featurizers
 ----------------
 
@@ -182,6 +230,40 @@ ConveRTFeaturizer
         - name: "ConveRTFeaturizer"
 
 
+.. _LanguageModelFeaturizer:
+
+LanguageModelFeaturizer
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:Short:
+    Creates a vector representation of user message and response (if specified) using a pre-trained language model.
+:Outputs:
+    nothing, used as an input to intent classifiers and response selectors that need intent features and response
+    features respectively (e.g. ``DIETClassifier`` and ``ResponseSelector``)
+:Requires: :ref:`HFTransformersNLP`
+:Type: Dense featurizer
+:Description:
+    Creates features for intent classification and response selection.
+    Uses the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component to compute vector
+    representations of input text.
+
+    .. warning::
+        Please make sure that you use a language model which is pre-trained on the same language corpus as that of your
+        training data.
+
+:Configuration:
+
+    Include ``HFTransformersNLP`` component before this component. Also, use :ref:`LanguageModelTokenizer` to ensure tokens
+    are correctly set for all components throughout the pipeline.
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "HFTransformersNLP"
+          model_name: # Name of language model to use
+        - name: "LanguageModelFeaturizer"
+
+
 RegexFeaturizer
 ~~~~~~~~~~~~~~~
 
@@ -784,6 +866,29 @@ ConveRTTokenizer
     Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.
 
 
+.. _LanguageModelTokenizer:
+
+LanguageModelTokenizer
+~~~~~~~~~~~~~~~~~~~~~~
+
+:Short: Tokenizer from pre-trained language models
+:Outputs: nothing
+:Requires: :ref:`HFTransformersNLP`
+:Description:
+    Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component.
+    Must be used whenever the ``LanguageModelFeaturizer`` is used.
+:Configuration:
+
+    Include ``HFTransformersNLP`` component upstream.
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "HFTransformersNLP"
+          model_name: # name of language model to use
+        - name: "LanguageModelTokenizer"
+
+
 
 Entity Extractors
 -----------------
diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
index e1f6dfc6f8e9..2c52c8109266 100644
--- a/rasa/nlu/utils/hugging_face/registry.py
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -32,7 +32,7 @@
 
 model_class_dict = {
     "bert": TFBertModel,
-    "openaigpt": TFOpenAIGPTModel,
+    "gpt": TFOpenAIGPTModel,
     "gpt2": TFGPT2Model,
     "xlnet": TFXLNetModel,
     # "xlm": TFXLMModel, # Currently doesn't work because of a bug in transformers library https://github.com/huggingface/transformers/issues/2729
@@ -41,7 +41,7 @@
 }
 model_tokenizer_dict = {
     "bert": BertTokenizer,
-    "openaigpt": OpenAIGPTTokenizer,
+    "gpt": OpenAIGPTTokenizer,
     "gpt2": GPT2Tokenizer,
     "xlnet": XLNetTokenizer,
     # "xlm": XLMTokenizer,
@@ -50,7 +50,7 @@
 }
 model_weights_defaults = {
     "bert": "bert-base-uncased",
-    "openaigpt": "openai-gpt",
+    "gpt": "openai-gpt",
     "gpt2": "gpt2",
     "xlnet": "xlnet-base-cased",
     # "xlm": "xlm-mlm-enfr-1024",
@@ -60,7 +60,7 @@
 
 model_special_tokens_pre_processors = {
     "bert": bert_tokens_pre_processor,
-    "openaigpt": gpt_tokens_pre_processor,
+    "gpt": gpt_tokens_pre_processor,
     "gpt2": gpt_tokens_pre_processor,
     "xlnet": xlnet_tokens_pre_processor,
     # "xlm": xlm_tokens_pre_processor,
@@ -70,7 +70,7 @@
 
 model_tokens_cleaners = {
     "bert": bert_tokens_cleaner,
-    "openaigpt": openaigpt_tokens_cleaner,
+    "gpt": openaigpt_tokens_cleaner,
     "gpt2": gpt2_tokens_cleaner,
     "xlnet": xlnet_tokens_cleaner,
     # "xlm": xlm_tokens_pre_processor,
@@ -80,7 +80,7 @@
 
 model_embeddings_post_processors = {
     "bert": bert_embeddings_post_processor,
-    "openaigpt": gpt_embeddings_post_processor,
+    "gpt": gpt_embeddings_post_processor,
     "gpt2": gpt_embeddings_post_processor,
     "xlnet": xlnet_embeddings_post_processor,
     # "xlm": xlm_embeddings_post_processor,
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
index 3fd85894fcb5..ac2ec662d72c 100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -32,7 +32,7 @@
             ],
         ),
         (
-            "openaigpt",
+            "gpt",
             ["Good evening.", "here is the sentence I want embeddings for."],
             [(3, 768), (10, 768)],
             [
@@ -225,3 +225,7 @@ def test_lm_featurizer_shape_values(
         assert np.allclose(
             computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
         )
+
+        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+
+        assert intent_vec is None
diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py
index 4f2baee8f9ac..50fb8cd60370 100644
--- a/tests/nlu/tokenizers/test_lm_tokenizer.py
+++ b/tests/nlu/tokenizers/test_lm_tokenizer.py
@@ -58,7 +58,7 @@
             ],
         ),
         (
-            "openaigpt",
+            "gpt",
             [
                 "Good evening.",
                 "hello",

From 3259788f3482c6e243ca19f6005837fdd1ea85be Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 18:03:21 +0100
Subject: [PATCH 301/633] add changelog, move common method out of class

---
 changelog/5187.feature.rst                    |  7 +++
 rasa/nlu/tokenizers/convert_tokenizer.py      | 40 +---------------
 .../nlu/utils/hugging_face/hf_transformers.py | 47 +++----------------
 rasa/utils/train_utils.py                     | 37 ++++++++++++++-
 4 files changed, 51 insertions(+), 80 deletions(-)
 create mode 100644 changelog/5187.feature.rst

diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst
new file mode 100644
index 000000000000..41e1ef79144c
--- /dev/null
+++ b/changelog/5187.feature.rst
@@ -0,0 +1,7 @@
+Integrate language models from HuggingFace's Transformers Library.
+
+Create a new NLP component ``HFTransformersNLP`` which tokenizes and featurizes incoming messages using a specified
+pre-trained model with the Transformers library as the backend.
+Create ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from HFTransformersNLP and
+sets them correctly for message object.
+Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa
\ No newline at end of file
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 620f396072c7..8f940e9fad24 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -4,6 +4,7 @@
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.training_data import Message
 from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES
+from rasa.utils.train_utils import align_tokens
 import tensorflow as tf
 
 
@@ -69,9 +70,7 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
             # clean tokens (remove special chars and empty tokens)
             split_token_strings = self._clean_tokens(split_token_strings)
 
-            _aligned_tokens = self._align_tokens(
-                split_token_strings, token_end, token_start
-            )
+            _aligned_tokens = align_tokens(split_token_strings, token_end, token_start)
             tokens_out += _aligned_tokens
 
         return tokens_out
@@ -81,38 +80,3 @@ def _clean_tokens(self, tokens: List[bytes]):
 
         tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
         return [string for string in tokens if string]
-
-    def _align_tokens(self, tokens_in: List[Text], token_end: int, token_start: int):
-        """Align sub-tokens of ConveRT with tokens return by the WhitespaceTokenizer.
-
-        As ConveRT might split a single word into multiple tokens, we need to make
-        sure that the start and end value of first and last sub-token matches the
-        start and end value of the token return by the WhitespaceTokenizer as the
-        entities are using those start and end values.
-        """
-
-        tokens_out = []
-
-        current_token_offset = token_start
-
-        for index, string in enumerate(tokens_in):
-            if index == 0:
-                if index == len(tokens_in) - 1:
-                    s_token_end = token_end
-                else:
-                    s_token_end = current_token_offset + len(string)
-                tokens_out.append(Token(string, token_start, end=s_token_end))
-            elif index == len(tokens_in) - 1:
-                tokens_out.append(Token(string, current_token_offset, end=token_end))
-            else:
-                tokens_out.append(
-                    Token(
-                        string,
-                        current_token_offset,
-                        end=current_token_offset + len(string),
-                    )
-                )
-
-            current_token_offset += len(string)
-
-        return tokens_out
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 8c1377c6fac7..95b1dbb8ed3c 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -6,6 +6,7 @@
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.utils.train_utils import align_tokens
 import numpy as np
 
 from rasa.nlu.utils.hugging_face.registry import (
@@ -52,10 +53,12 @@ def _load_model(self) -> None:
 
         if self.model_name not in model_class_dict:
             logger.error(
-                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}"
+                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))} or create"
+                f"a new class inheriting from this class to support your model."
             )
             raise KeyError(
-                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}"
+                f"{self.model_name} not a valid model name. Choose from {str(list(model_class_dict.keys()))}or create"
+                f"a new class inheriting from this class to support your model."
             )
 
         self.model_weights = self.component_config["model_weights"]
@@ -126,42 +129,6 @@ def _post_process_sequence_embeddings(
             np.array(post_processed_sequence_embeddings),
         )
 
-    @staticmethod
-    def _align_tokens(tokens_in: List[Text], token_end: int, token_start: int):
-        """Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer.
-
-        As a language model might split a single word into multiple tokens, we need to make
-        sure that the start and end value of first and last sub-token matches the
-        start and end value of the token return by the WhitespaceTokenizer as the
-        entities are using those start and end values.
-        """
-
-        tokens_out = []
-
-        current_token_offset = token_start
-
-        for index, string in enumerate(tokens_in):
-            if index == 0:
-                if index == len(tokens_in) - 1:
-                    s_token_end = token_end
-                else:
-                    s_token_end = current_token_offset + len(string)
-                tokens_out.append(Token(string, token_start, end=s_token_end))
-            elif index == len(tokens_in) - 1:
-                tokens_out.append(Token(string, current_token_offset, end=token_end))
-            else:
-                tokens_out.append(
-                    Token(
-                        string,
-                        current_token_offset,
-                        end=current_token_offset + len(string),
-                    )
-                )
-
-            current_token_offset += len(string)
-
-        return tokens_out
-
     def _tokenize_example(self, message: Message, attribute: Text):
 
         tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)
@@ -180,9 +147,7 @@ def _tokenize_example(self, message: Message, attribute: Text):
 
             token_ids_out += split_token_ids
 
-            _aligned_tokens = self._align_tokens(
-                split_token_strings, token_end, token_start
-            )
+            _aligned_tokens = align_tokens(split_token_strings, token_end, token_start)
             tokens_out += _aligned_tokens
 
         return tokens_out, token_ids_out
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 17ebcb124ab8..19c51293e478 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,7 +1,8 @@
 import logging
 import tensorflow as tf
 import numpy as np
-from typing import Optional, Text, Dict, Any
+from typing import Optional, Text, Dict, Any, List
+from rasa.nlu.tokenizers.tokenizer import Token
 
 from rasa.utils.tensorflow.constants import SIMILARITY_TYPE, LOSS_TYPE
 
@@ -41,3 +42,37 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
             config[SIMILARITY_TYPE] = "cosine"
 
     return config
+
+
+def align_tokens(tokens_in: List[Text], token_end: int, token_start: int):
+    """Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer.
+
+    As a language model might split a single word into multiple tokens, we need to make
+    sure that the start and end value of first and last sub-token matches the
+    start and end value of the token return by the WhitespaceTokenizer as the
+    entities are using those start and end values.
+    """
+
+    tokens_out = []
+
+    current_token_offset = token_start
+
+    for index, string in enumerate(tokens_in):
+        if index == 0:
+            if index == len(tokens_in) - 1:
+                s_token_end = token_end
+            else:
+                s_token_end = current_token_offset + len(string)
+            tokens_out.append(Token(string, token_start, end=s_token_end))
+        elif index == len(tokens_in) - 1:
+            tokens_out.append(Token(string, current_token_offset, end=token_end))
+        else:
+            tokens_out.append(
+                Token(
+                    string, current_token_offset, end=current_token_offset + len(string)
+                )
+            )
+
+        current_token_offset += len(string)
+
+    return tokens_out

From d10d73fe4132fbb50c20335c732626bea418be7b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 5 Feb 2020 18:07:28 +0100
Subject: [PATCH 302/633] refactor spacy doc name

---
 .../extractors/test_crf_entity_extractor.py    | 18 +++++++++---------
 .../extractors/test_spacy_entity_extractors.py |  4 ++--
 tests/nlu/featurizers/test_regex_featurizer.py |  2 +-
 tests/nlu/featurizers/test_spacy_featurizer.py |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index 8c832894fd25..8dd9aa72db8c 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -15,7 +15,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
                 "entities": [
                     {"start": 16, "end": 20, "value": "west", "entity": "location"}
                 ],
-                "spacy_doc": spacy_nlp("anywhere in the west"),
+                "text_spacy_doc": spacy_nlp("anywhere in the west"),
             },
         ),
         Message(
@@ -38,7 +38,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
                         "extractor": "CRFEntityExtractor",
                     },
                 ],
-                "spacy_doc": spacy_nlp("central indian restaurant"),
+                "text_spacy_doc": spacy_nlp("central indian restaurant"),
             },
         ),
     ]
@@ -46,7 +46,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
     # uses BILOU and the default features
     ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
     sentence = "anywhere in the west"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
+    doc = {"text_spacy_doc": spacy_nlp(sentence)}
     crf_format = ext._from_text_to_crf(Message(sentence, doc))
     assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"]
     feats = ext._sentence_to_features(crf_format)
@@ -54,7 +54,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
     assert "EOS" in feats[-1]
     assert feats[1]["0:low"] == "in"
     sentence = "anywhere in the west"
-    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
+    ext.extract_entities(Message(sentence, {"text_spacy_doc": spacy_nlp(sentence)}))
     filtered = ext.filter_trainable_entities(examples)
     assert filtered[0].get("entities") == [
         {"start": 16, "end": 20, "value": "west", "entity": "location"}
@@ -82,7 +82,7 @@ def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
 
     ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
     sentence = "I need a home cleaning close-by"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
+    doc = {"text_spacy_doc": spacy_nlp(sentence)}
     r = ext._from_crf_to_json(
         Message(sentence, doc),
         [
@@ -113,7 +113,7 @@ def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
     ner_crf_pos_feature_config.update({"BILOU_flag": False})
     ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
     sentence = "I need a home cleaning close-by"
-    doc = {"spacy_doc": spacy_nlp(sentence)}
+    doc = {"text_spacy_doc": spacy_nlp(sentence)}
     rs = ext._from_crf_to_json(
         Message(sentence, doc),
         [
@@ -173,7 +173,7 @@ def test_crf_create_entity_dict(spacy_nlp):
                             },
                         }
                     ],
-                    "spacy_doc": spacy_nlp("where is St. Michael's Hospital?"),
+                    "text_spacy_doc": spacy_nlp("where is St. Michael's Hospital?"),
                 },
             )
         },
@@ -198,7 +198,7 @@ def test_crf_create_entity_dict(spacy_nlp):
                             },
                         }
                     ],
-                    "spacy_doc": spacy_nlp("where is Children's Hospital?"),
+                    "text_spacy_doc": spacy_nlp("where is Children's Hospital?"),
                 },
             )
         },
@@ -244,7 +244,7 @@ def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
 
     text = "Rasa is a company in Berlin"
     message = Message(text)
-    message.set("spacy_doc", spacy_nlp(text))
+    message.set("text_spacy_doc", spacy_nlp(text))
 
     white_space_tokenizer.process(message)
     spacy_featurizer.process(message)
diff --git a/tests/nlu/extractors/test_spacy_entity_extractors.py b/tests/nlu/extractors/test_spacy_entity_extractors.py
index 0c5e59ae5b7a..f417ae0f422d 100644
--- a/tests/nlu/extractors/test_spacy_entity_extractors.py
+++ b/tests/nlu/extractors/test_spacy_entity_extractors.py
@@ -10,7 +10,7 @@ def test_spacy_ner_extractor(component_builder, spacy_nlp):
         {
             "intent": "restaurant_search",
             "entities": [],
-            "spacy_doc": spacy_nlp("anywhere in the west"),
+            "text_spacy_doc": spacy_nlp("anywhere in the west"),
         },
     )
 
@@ -33,7 +33,7 @@ def test_spacy_ner_extractor(component_builder, spacy_nlp):
         {
             "intent": "example_intent",
             "entities": [],
-            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"),
+            "text_spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"),
         },
     )
     _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py
index dcc9b80e107d..e49f04934700 100644
--- a/tests/nlu/featurizers/test_regex_featurizer.py
+++ b/tests/nlu/featurizers/test_regex_featurizer.py
@@ -141,7 +141,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
     component_config = {"name": "SpacyTokenizer"}
     tokenizer = SpacyTokenizer(component_config)
     message = Message(sentence)
-    message.set("spacy_doc", spacy_nlp(sentence))
+    message.set("text_spacy_doc", spacy_nlp(sentence))
     tokenizer.process(message)
 
     result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE)
diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py
index e13acd4a0312..5c3bf5220578 100644
--- a/tests/nlu/featurizers/test_spacy_featurizer.py
+++ b/tests/nlu/featurizers/test_spacy_featurizer.py
@@ -105,7 +105,7 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
     greet = {"intent": "greet", "text_features": [0.5]}
 
     message = Message(sentence, greet)
-    message.set("spacy_doc", doc)
+    message.set("text_spacy_doc", doc)
 
     ftr._set_spacy_features(message)
 

From 1d931d73d0069801ba67d92ed4c58ea099b1cdf2 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 6 Feb 2020 11:28:37 +0100
Subject: [PATCH 303/633] improve exception errors, rename d_model, accept
 training None

---
 rasa/utils/tensorflow/layers.py      |  28 +++--
 rasa/utils/tensorflow/transformer.py | 153 ++++++++++++++++-----------
 2 files changed, 113 insertions(+), 68 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 1d29c884cf35..0579f70f4bbe 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -1,14 +1,20 @@
 import logging
-from typing import List, Optional, Text, Tuple, Callable
+from typing import List, Optional, Text, Tuple, Callable, Union
 import tensorflow as tf
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
 
 logger = logging.getLogger(__name__)
 
 
 class SparseDropout(tf.keras.layers.Dropout):
-    def call(self, inputs: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+    def call(
+        self, inputs: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None
+    ) -> tf.Tensor:
+        if training is None:
+            training = K.learning_phase()
+
         def dropped_inputs() -> tf.Tensor:
             to_retain_prob = tf.random.uniform(
                 tf.shape(inputs.values), 0, 1, inputs.values.dtype
@@ -100,7 +106,12 @@ def __init__(
             )
             self._ffn_layers.append(tf.keras.layers.Dropout(dropout_rate))
 
-    def call(self, x: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+    def call(
+        self, x: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None
+    ) -> tf.Tensor:
+        if training is None:
+            training = K.learning_phase()
+
         for layer in self._ffn_layers:
             x = layer(x, training=training)
 
@@ -126,11 +137,11 @@ def __init__(
                 f"should be 'cosine' or 'inner'"
             )
 
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._dense = tf.keras.layers.Dense(
             units=embed_dim,
             activation=None,
-            kernel_regularizer=l2_regularizer,
+            kernel_regularizer=regularizer,
             name=f"embed_layer_{layer_name_suffix}",
         )
 
@@ -150,9 +161,14 @@ def build(self, input_shape: tf.TensorShape) -> None:
         self.built = True
 
     def call(
-        self, x: tf.Tensor, mask: tf.Tensor, training: tf.Tensor
+        self,
+        x: tf.Tensor,
+        mask: tf.Tensor,
+        training: Optional[Union[tf.Tensor, bool]] = None,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Randomly mask input sequences."""
+        if training is None:
+            training = K.learning_phase()
 
         lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
         lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index d9bd09b70d7f..5e3417f0a916 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -1,7 +1,8 @@
-from typing import List, Optional, Text, Tuple, Callable
+from typing import List, Optional, Text, Tuple, Union
 import tensorflow as tf
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.keras import backend as K
 import numpy as np
 from rasa.utils.tensorflow.layers import DenseWithSparseWeights
 
@@ -11,7 +12,7 @@
 class MultiHeadAttention(tf.keras.layers.Layer):
     def __init__(
         self,
-        d_model: int,
+        units: int,
         num_heads: int,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
@@ -22,8 +23,14 @@ def __init__(
     ) -> None:
         super().__init__()
 
+        if units % num_heads != 0:
+            raise ValueError(
+                f"number of units {units} should be proportional to "
+                f"number of attention heads {num_heads}."
+            )
+
         self.num_heads = num_heads
-        self.d_model = d_model
+        self.units = units
         self.attention_dropout_rate = attention_dropout_rate
         self.unidirectional = unidirectional
         self.use_key_relative_position = use_key_relative_position
@@ -33,21 +40,23 @@ def __init__(
             self.relative_length += 1  # include current time
         self.heads_share_relative_embedding = heads_share_relative_embedding
 
-        assert d_model % self.num_heads == 0
-
-        self._depth = d_model // self.num_heads
+        self._depth = units // self.num_heads
 
-        self._wq = DenseWithSparseWeights(units=d_model, use_bias=False)
-        self._wk = DenseWithSparseWeights(units=d_model, use_bias=False)
-        self._wv = DenseWithSparseWeights(units=d_model, use_bias=False)
+        self._wq = DenseWithSparseWeights(units=units, use_bias=False)
+        self._wk = DenseWithSparseWeights(units=units, use_bias=False)
+        self._wv = DenseWithSparseWeights(units=units, use_bias=False)
 
-        self._dense = DenseWithSparseWeights(units=d_model)
+        self._dense = DenseWithSparseWeights(units=units)
 
         self._create_relative_embeddings()
 
     def _create_relative_embeddings(self) -> None:
         """Create relative embeddings."""
 
+        relative_embedding_shape = None
+        self.key_relative_embeddings = None
+        self.value_relative_embeddings = None
+
         if self.use_key_relative_position or self.use_value_relative_position:
             if not self.relative_length:
                 raise ValueError(
@@ -68,22 +77,16 @@ def _create_relative_embeddings(self) -> None:
                     relative_length,
                     self._depth,
                 )
-        else:
-            relative_embedding_shape = None
 
         if self.use_key_relative_position:
             self.key_relative_embeddings = self.add_weight(
                 shape=relative_embedding_shape, name="key_relative_embeddings",
             )
-        else:
-            self.key_relative_embeddings = None
 
         if self.use_value_relative_position:
             self.value_relative_embeddings = self.add_weight(
                 shape=relative_embedding_shape, name="value_relative_embeddings",
             )
-        else:
-            self.value_relative_embeddings = None
 
     def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
         # pad the left side to length
@@ -123,7 +126,10 @@ def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor:
         x_dim = len(x.shape)
 
         if x_dim < 4 or x_dim > 5:
-            raise ValueError("Relative tensor has a wrong shape.")
+            raise ValueError(
+                f"Relative tensor has a wrong shape {x.shape}, "
+                f"it should have 4 or 5 dimensions."
+            )
         if x_dim == 4:
             # add fake depth dimension
             x = tf.expand_dims(x, axis=-1)
@@ -292,20 +298,23 @@ def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
             x, perm=[0, 2, 1, 3]
         )  # (batch_size, seq_len_q, num_heads, depth)
         return tf.reshape(
-            x, (tf.shape(x)[0], -1, self.d_model)
-        )  # (batch_size, seq_len_q, d_model)
+            x, (tf.shape(x)[0], -1, self.units)
+        )  # (batch_size, seq_len_q, units)
 
     def call(
         self,
         v: tf.Tensor,
         k: tf.Tensor,
         q: tf.Tensor,
-        pad_mask: Optional[tf.Tensor],
-        training: tf.Tensor,
+        pad_mask: Optional[tf.Tensor] = None,
+        training: Optional[Union[tf.Tensor, bool]] = None,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
-        q = self._wq(q)  # (batch_size, seq_len_q, d_model)
-        k = self._wk(k)  # (batch_size, seq_len_k, d_model)
-        v = self._wv(v)  # (batch_size, seq_len_v, d_model)
+        if training is None:
+            training = K.learning_phase()
+
+        q = self._wq(q)  # (batch_size, seq_len_q, units)
+        k = self._wk(k)  # (batch_size, seq_len_k, units)
+        v = self._wv(v)  # (batch_size, seq_len_v, units)
 
         q = self._split_heads(q)  # (batch_size, num_heads, seq_len_q, depth)
         k = self._split_heads(k)  # (batch_size, num_heads, seq_len_k, depth)
@@ -316,9 +325,9 @@ def call(
         )
         # attention.shape == (batch_size, num_heads, seq_len_q, depth)
         # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
-        attention = self._combine_heads(attention)  # (batch_size, seq_len_q, d_model)
+        attention = self._combine_heads(attention)  # (batch_size, seq_len_q, units)
 
-        output = self._dense(attention)  # (batch_size, seq_len_q, d_model)
+        output = self._dense(attention)  # (batch_size, seq_len_q, units)
 
         return output, attention_weights
 
@@ -326,9 +335,9 @@ def call(
 class TransformerEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self,
-        d_model: int,
+        units: int,
         num_heads: int,
-        dff: int,
+        filter_units: int,
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
         unidirectional: bool = False,
@@ -341,7 +350,7 @@ def __init__(
 
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self._mha = MultiHeadAttention(
-            d_model,
+            units,
             num_heads,
             attention_dropout_rate,
             unidirectional,
@@ -355,34 +364,44 @@ def __init__(
         self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
             DenseWithSparseWeights(
-                units=dff, activation=tfa.activations.gelu
-            ),  # (batch_size, seq_len, dff)
+                units=filter_units, activation=tfa.activations.gelu
+            ),  # (batch_size, seq_len, filter_units)
             tf.keras.layers.Dropout(dropout_rate),
-            DenseWithSparseWeights(units=d_model),  # (batch_size, seq_len, d_model)
+            DenseWithSparseWeights(units=units),  # (batch_size, seq_len, units)
             tf.keras.layers.Dropout(dropout_rate),
         ]
 
-    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
-        x_norm = self._layernorm(x)  # (batch_size, seq_len, d_model)
-        attn_out, _ = self._mha(x_norm, x_norm, x_norm, pad_mask, training=training)
+    def call(
+        self,
+        x: tf.Tensor,
+        pad_mask: Optional[tf.Tensor] = None,
+        training: Optional[Union[tf.Tensor, bool]] = None,
+    ) -> tf.Tensor:
+        if training is None:
+            training = K.learning_phase()
+
+        x_norm = self._layernorm(x)  # (batch_size, seq_len, units)
+        attn_out, _ = self._mha(
+            x_norm, x_norm, x_norm, pad_mask=pad_mask, training=training
+        )
         attn_out = self._dropout(attn_out, training=training)
         x += attn_out
 
-        ffn_out = x  # (batch_size, seq_len, d_model)
+        ffn_out = x  # (batch_size, seq_len, units)
         for layer in self._ffn_layers:
             ffn_out = layer(ffn_out, training=training)
         x += ffn_out
 
-        return x  # (batch_size, seq_len, d_model)
+        return x  # (batch_size, seq_len, units)
 
 
 class TransformerEncoder(tf.keras.layers.Layer):
     def __init__(
         self,
         num_layers: int,
-        d_model: int,
+        units: int,
         num_heads: int,
-        dff: int,
+        filter_units: int,
         max_seq_length: int,
         reg_lambda: float,
         dropout_rate: float = 0.1,
@@ -396,23 +415,23 @@ def __init__(
     ) -> None:
         super().__init__(name=name)
 
-        self.d_model = d_model
+        self.units = units
         self.unidirectional = unidirectional
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._embedding = DenseWithSparseWeights(
-            units=d_model, kernel_regularizer=l2_regularizer
+            units=units, kernel_regularizer=l2_regularizer
         )
 
-        self._pos_encoding = self._positional_encoding(max_seq_length, self.d_model)
+        self._pos_encoding = self._positional_encoding(max_seq_length, self.units)
 
         self._dropout = tf.keras.layers.Dropout(dropout_rate)
 
         self._enc_layers = [
             TransformerEncoderLayer(
-                d_model,
+                units,
                 num_heads,
-                dff,
+                filter_units,
                 dropout_rate,
                 attention_dropout_rate,
                 unidirectional,
@@ -431,16 +450,16 @@ def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
         return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
 
     @staticmethod
-    def _get_angles(pos: np.ndarray, i: np.ndarray, d_model: int) -> np.ndarray:
-        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+    def _get_angles(pos: np.ndarray, i: np.ndarray, units: int) -> np.ndarray:
+        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(units))
         return pos * angle_dropout_rates
 
     @classmethod
-    def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
+    def _positional_encoding(cls, max_position: int, units: int) -> tf.Tensor:
         angle_rads = cls._get_angles(
             np.arange(max_position)[:, np.newaxis],
-            np.arange(d_model)[np.newaxis, :],
-            d_model,
+            np.arange(units)[np.newaxis, :],
+            units,
         )
 
         # apply sin to even indices in the array; 2i
@@ -453,26 +472,36 @@ def _positional_encoding(cls, max_position: int, d_model: int) -> tf.Tensor:
 
         return tf.cast(pos_encoding, dtype=tf.float32)
 
-    def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Tensor:
+    def call(
+        self,
+        x: tf.Tensor,
+        pad_mask: Optional[tf.Tensor] = None,
+        training: Optional[Union[tf.Tensor, bool]] = None,
+    ) -> tf.Tensor:
+        if training is None:
+            training = K.learning_phase()
 
         # adding embedding and position encoding.
-        x = self._embedding(x)  # (batch_size, seq_len, d_model)
-        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
-        x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
+        x = self._embedding(x)  # (batch_size, seq_len, units)
+        x *= tf.math.sqrt(tf.cast(self.units, tf.float32))
+        if pad_mask is not None:
+            x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
         x = self._dropout(x, training=training)
 
-        pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
-        pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
-        if self.unidirectional:
-            # add look ahead pad mask to emulate unidirectional behavior
-            pad_mask = tf.minimum(
-                1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
-            )  # (batch_size, 1, seq_len, seq_len)
+        if pad_mask is not None:
+            pad_mask = tf.squeeze(pad_mask, -1)  # (batch_size, seq_len)
+            pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :]
+            # pad_mask.shape = (batch_size, 1, 1, seq_len)
+            if self.unidirectional:
+                # add look ahead pad mask to emulate unidirectional behavior
+                pad_mask = tf.minimum(
+                    1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1])
+                )  # (batch_size, 1, seq_len, seq_len)
 
         for layer in self._enc_layers:
-            x = layer(x, pad_mask, training)  # (batch_size, seq_len, d_model)
+            x = layer(x, pad_mask=pad_mask, training=training)
 
         # if normalization is done in encoding layers, then it should also be done
         # on the output, since the output can grow very large, being the sum of
         # a whole stack of unnormalized layer outputs.
-        return self._layernorm(x)  # (batch_size, seq_len, d_model)
+        return self._layernorm(x)  # (batch_size, seq_len, units)

From 452368dbf72d0c38cbb9c7398f51bb2f209a2b3e Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Thu, 6 Feb 2020 11:58:28 +0100
Subject: [PATCH 304/633] Apply suggestions from code review

Co-Authored-By: Vladimir Vlasov <vladimir@rasa.com>
---
 changelog/5187.feature.rst | 6 +++---
 docs/nlu/components.rst    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst
index 41e1ef79144c..41f348c240e8 100644
--- a/changelog/5187.feature.rst
+++ b/changelog/5187.feature.rst
@@ -1,7 +1,7 @@
 Integrate language models from HuggingFace's Transformers Library.
 
-Create a new NLP component ``HFTransformersNLP`` which tokenizes and featurizes incoming messages using a specified
+Add a new NLP component ``HFTransformersNLP`` which tokenizes and featurizes incoming messages using a specified
 pre-trained model with the Transformers library as the backend.
-Create ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from HFTransformersNLP and
+Add ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from HFTransformersNLP and
 sets them correctly for message object.
-Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa
\ No newline at end of file
+Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 973c635ebba6..baaabaea6f64 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -88,7 +88,7 @@ SpacyNLP
 HFTransformersNLP
 ~~~~~~~~~~~~~~~~~
 
-:Short: Transformers based pre-trained language model initializer
+:Short: HuggingFace's Transformers based pre-trained language model initializer
 :Outputs: nothing
 :Requires: nothing
 :Description:

From 38b6a01e0952ee7fb03efe022dc59419f1bbdffb Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 6 Feb 2020 14:46:30 +0100
Subject: [PATCH 305/633] added new components to test pipelines

---
 rasa/nlu/utils/hugging_face/hf_transformers.py | 2 +-
 tests/nlu/training/test_train.py               | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 95b1dbb8ed3c..413f2bd730ad 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -34,7 +34,7 @@ class HFTransformersNLP(Component):
 
     defaults = {
         # name of the language model to load.
-        "model_name": None,
+        "model_name": "bert",
         # Pre-Trained weights to be loaded(string)
         "model_weights": None,
     }
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index d88436564e2e..c4bb193ea3f2 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -27,16 +27,19 @@ def pipelines_for_tests():
             as_pipeline(
                 "SpacyNLP",
                 "MitieNLP",
+                "HFTransformersNLP",
                 "WhitespaceTokenizer",
                 "ConveRTTokenizer",
                 "MitieTokenizer",
                 "SpacyTokenizer",
+                "LanguageModelTokenizer",
                 "MitieFeaturizer",
                 "SpacyFeaturizer",
                 "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "ConveRTFeaturizer",
+                "LanguageModelFeaturizer",
                 "MitieEntityExtractor",
                 "CRFEntityExtractor",
                 "SpacyEntityExtractor",

From 9d63670e58e0453a68e3e3e20eb6a26b69fec6bc Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 7 Feb 2020 17:01:58 +0100
Subject: [PATCH 306/633] implemented first level of tensorflow configuration

---
 rasa/__main__.py                  |  4 ++
 rasa/utils/tensorflow/__init__.py | 70 +++++++++++++++++++++++++++++++
 rasa/utils/train_utils.py         |  9 ----
 3 files changed, 74 insertions(+), 9 deletions(-)

diff --git a/rasa/__main__.py b/rasa/__main__.py
index 30fd6d1576cc..9c0017d4d383 100644
--- a/rasa/__main__.py
+++ b/rasa/__main__.py
@@ -8,6 +8,7 @@
 from rasa.cli.arguments.default_arguments import add_logging_options
 from rasa.cli.utils import parse_last_positional_argument_as_model_path
 from rasa.utils.common import set_log_level
+from rasa.utils.tensorflow import setup_tf_environment
 
 logger = logging.getLogger(__name__)
 
@@ -68,6 +69,9 @@ def main() -> None:
     )
     set_log_level(log_level)
 
+    # Set tensorflow environment
+    setup_tf_environment()
+
     # insert current path in syspath so custom modules are found
     sys.path.insert(1, os.getcwd())
 
diff --git a/rasa/utils/tensorflow/__init__.py b/rasa/utils/tensorflow/__init__.py
index e69de29bb2d1..5bb0a5ee9ca8 100644
--- a/rasa/utils/tensorflow/__init__.py
+++ b/rasa/utils/tensorflow/__init__.py
@@ -0,0 +1,70 @@
+import os
+import tensorflow as tf
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def setup_gpu_environment(gpu_memory_config):
+
+    if gpu_memory_config:
+
+        # Parse GPU config
+        # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
+        # Parse it and store in a dictionary
+        parsed_gpu_config = {
+            instance.split(":")[0].strip(): int(instance.split(":")[1].strip())
+            for instance in gpu_memory_config.split(",")
+        }
+
+        physical_gpus = tf.config.list_physical_devices("GPU")
+
+        # Logic taken from https://www.tensorflow.org/guide/gpu
+        if physical_gpus:
+
+            for gpu_id, gpu_id_memory in parsed_gpu_config.items():
+                try:
+                    tf.config.experimental.set_virtual_device_configuration(
+                        physical_gpus[int(gpu_id)],
+                        [
+                            tf.config.experimental.VirtualDeviceConfiguration(
+                                memory_limit=gpu_id_memory
+                            )
+                        ],
+                    )
+
+                except RuntimeError as e:
+                    # Virtual devices must be set before GPUs have been initialized
+                    raise RuntimeError(
+                        "Error while setting up tensorflow environment. "
+                        "Virtual devices must be set before GPUs have been initialized"
+                    )
+
+        else:
+            logger.info(
+                "You have an environment variable GPU_MEMORY_ALLOC set but no GPUs were detected to configure"
+            )
+
+
+def setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads):
+
+    if inter_op_parallel_threads:
+        tf.config.threading.set_inter_op_parallelism_threads(
+            int(inter_op_parallel_threads.strip())
+        )
+
+    if intra_op_parallel_threads:
+        tf.config.threading.set_intra_op_parallelism_threads(
+            int(intra_op_parallel_threads.strip())
+        )
+
+
+def setup_tf_environment():
+
+    # Get all env variables
+    gpu_memory_config = os.getenv("TF_GPU_MEMORY_ALLOC", None)
+    inter_op_parallel_threads = os.getenv("TF_INTER_OP_PARALLELISM_THREADS", None)
+    intra_op_parallel_threads = os.getenv("TF_INTRA_OP_PARALLELISM_THREADS", None)
+
+    setup_gpu_environment(gpu_memory_config)
+    setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads)
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 17ebcb124ab8..9477edc754dd 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -8,15 +8,6 @@
 logger = logging.getLogger(__name__)
 
 
-def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]:
-    """Prepare `tf.compat.v1.ConfigProto` for training"""
-
-    if config.get("tf_config") is not None:
-        return tf.compat.v1.ConfigProto(**config.pop("tf_config"))
-    else:
-        return None
-
-
 def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray:
     """Normalizes an array of positive numbers over the top `ranking_length` values.
     Other values will be set to 0.

From 4b1eb098029defdfe8bd76b51718a8f5a40b2fff Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 7 Feb 2020 17:07:29 +0000
Subject: [PATCH 307/633] split data with no label key

---
 rasa/utils/tensorflow/tf_model_data.py | 25 ++++++++++++++++++-------
 tests/utils/test_tf_model_data.py      | 18 +++++++++++++++++-
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index bb4df7240428..18664d4c3384 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -82,20 +82,31 @@ def split(
 
         self._check_label_key()
 
-        label_ids = self._create_label_ids(self.data[self.label_key][0])
-        label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
+        if self.label_key is None:
+            multi_values = [v for values in self.data.values() for v in values]
+            solo_values = [[] for values in self.data.values() for v in values]
+            stratify = None
+        else:
+            label_ids = self._create_label_ids(self.data[self.label_key][0])
+            label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
+
+            self._check_train_test_sizes(number_of_test_examples, label_counts)
 
-        self._check_train_test_sizes(number_of_test_examples, label_counts)
+            counts = np.array([label_counts[label] for label in label_ids])
+            multi_values = [
+                v[counts > 1] for values in self.data.values() for v in values
+            ]
+            solo_values = [
+                v[counts == 1] for values in self.data.values() for v in values
+            ]
 
-        counts = np.array([label_counts[label] for label in label_ids])
-        multi_values = [v[counts > 1] for values in self.data.values() for v in values]
-        solo_values = [v[counts == 1] for values in self.data.values() for v in values]
+            stratify = label_ids[counts > 1]
 
         output_values = train_test_split(
             *multi_values,
             test_size=number_of_test_examples,
             random_state=random_seed,
-            stratify=label_ids[counts > 1],
+            stratify=stratify,
         )
 
         return self._convert_train_test_split(output_values, solo_values)
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_tf_model_data.py
index 5f5d3009c634..a68dddf8563d 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_tf_model_data.py
@@ -77,7 +77,7 @@ def test_shuffle_session_data(model_data: RasaModelData):
     assert np.all(np.array(model_data.values()) != np.array(data.values()))
 
 
-def test_split_session_data_by_label(model_data: RasaModelData):
+def test_split_data_by_label(model_data: RasaModelData):
     split_model_data = model_data._split_by_label_ids(
         model_data.data, model_data.get("intent_ids")[0], np.array([0, 1])
     )
@@ -87,6 +87,22 @@ def test_split_session_data_by_label(model_data: RasaModelData):
         assert len(set(s.get("intent_ids")[0])) == 1
 
 
+def test_split_data_by_none_label(model_data: RasaModelData):
+    model_data.label_key = None
+
+    split_model_data = model_data.split(2, 42)
+
+    assert len(split_model_data) == 2
+
+    train_data = split_model_data[0]
+    test_data = split_model_data[1]
+
+    # train data should have 3 examples
+    assert len(train_data.get("intent_ids")[0]) == 3
+    # test data should have 2 examples
+    assert len(test_data.get("intent_ids")[0]) == 2
+
+
 def test_train_val_split(model_data: RasaModelData):
     train_model_data, test_model_data = model_data.split(2, 42)
 

From bcb6b9f835b72a167f44038ceccace92b2165722 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 7 Feb 2020 17:10:00 +0000
Subject: [PATCH 308/633] review comments

---
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 rasa/nlu/extractors/crf_entity_extractor.py         | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8ac3da8eca80..12b1191654e6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -49,7 +49,7 @@
 
 class EmbeddingIntentClassifier(DIETClassifier):
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = ["intent", "intent_ranking"]
 
     requires = [
         any_of(
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 8bb2c0cbfc2e..57b16e83d85f 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -207,12 +207,6 @@ def load(
             data_example,
         ) = cls._load_from_files(meta, model_dir)
 
-        meta[LABEL_CLASSIFICATION] = False
-        meta[ENTITY_RECOGNITION] = True
-        meta[MASKED_LM] = False
-        meta[NUM_TRANSFORMER_LAYERS] = 0
-        meta[SHARE_HIDDEN_LAYERS] = False
-
         model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
 
         return cls(

From 7ccafc326e98dd71ad94821252f0e2a8d00a8133 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Sun, 9 Feb 2020 17:53:22 +0100
Subject: [PATCH 309/633] created new pipeline for failing tests

---
 tests/nlu/training/test_train.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index c4bb193ea3f2..86978b7da1a1 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -27,19 +27,16 @@ def pipelines_for_tests():
             as_pipeline(
                 "SpacyNLP",
                 "MitieNLP",
-                "HFTransformersNLP",
                 "WhitespaceTokenizer",
                 "ConveRTTokenizer",
                 "MitieTokenizer",
                 "SpacyTokenizer",
-                "LanguageModelTokenizer",
                 "MitieFeaturizer",
                 "SpacyFeaturizer",
                 "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "ConveRTFeaturizer",
-                "LanguageModelFeaturizer",
                 "MitieEntityExtractor",
                 "CRFEntityExtractor",
                 "SpacyEntityExtractor",
@@ -63,6 +60,19 @@ def pipelines_for_tests():
                 "KeywordIntentClassifier",
             ),
         ),
+        (
+            # Created another test pipeline because dense featurizers can't co-exist in the same pipeline together,
+            # because of the mismatch in sequence length their tokenizers break the incoming message into.
+            "en",
+            as_pipeline(
+                "HFTransformersNLP",
+                "LanguageModelTokenizer",
+                "LanguageModelFeaturizer",
+                "CRFEntityExtractor",
+                "DIETClassifier",
+                "ResponseSelector",
+            ),
+        ),
     ]
 
 

From 6d9c88663c84cf7e3bc79dbb522bc875f934e544 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 10 Feb 2020 10:39:00 +0100
Subject: [PATCH 310/633] separate pipeline for convert as well

---
 tests/nlu/training/test_train.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 86978b7da1a1..786f48d27012 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -28,7 +28,6 @@ def pipelines_for_tests():
                 "SpacyNLP",
                 "MitieNLP",
                 "WhitespaceTokenizer",
-                "ConveRTTokenizer",
                 "MitieTokenizer",
                 "SpacyTokenizer",
                 "MitieFeaturizer",
@@ -36,7 +35,6 @@ def pipelines_for_tests():
                 "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
-                "ConveRTFeaturizer",
                 "MitieEntityExtractor",
                 "CRFEntityExtractor",
                 "SpacyEntityExtractor",
@@ -60,14 +58,29 @@ def pipelines_for_tests():
                 "KeywordIntentClassifier",
             ),
         ),
+        # Create separate test pipelines for dense featurizers because they can't co-exist in the same pipeline
+        # together, as their tokenizers break the incoming message into different number of tokens.
         (
-            # Created another test pipeline because dense featurizers can't co-exist in the same pipeline together,
-            # because of the mismatch in sequence length their tokenizers break the incoming message into.
             "en",
             as_pipeline(
                 "HFTransformersNLP",
                 "LanguageModelTokenizer",
                 "LanguageModelFeaturizer",
+                "LexicalSyntacticFeaturizer",
+                "CountVectorsFeaturizer",
+                "CRFEntityExtractor",
+                "DIETClassifier",
+                "ResponseSelector",
+            ),
+        ),
+        (
+            "en",
+            as_pipeline(
+                "ConveRTTokenizer",
+                "ConveRTFeaturizer",
+                "LanguageModelFeaturizer",
+                "LexicalSyntacticFeaturizer",
+                "CountVectorsFeaturizer",
                 "CRFEntityExtractor",
                 "DIETClassifier",
                 "ResponseSelector",

From e5e88cc654494734caed6a5640f4c6c14b2614a3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 13:17:08 +0100
Subject: [PATCH 311/633] set label_key to None

---
 rasa/core/policies/embedding_policy.py      | 2 --
 rasa/nlu/classifiers/diet_classifier.py     | 4 ++--
 rasa/nlu/extractors/crf_entity_extractor.py | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 5c650c976fd4..c931b96d1f62 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -44,8 +44,6 @@ class EmbeddingPolicy(TEDPolicy):
     The policy used in our paper https://arxiv.org/abs/1910.00486
     """
 
-    SUPPORTS_ONLINE_TRAINING = True
-
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c030abba06c4..31a683e1b4e1 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -247,7 +247,7 @@ def __init__(
         self.data_example = None
 
         self.label_key = (
-            "label_ids" if self.component_config[LABEL_CLASSIFICATION] else "tag_ids"
+            "label_ids" if self.component_config[LABEL_CLASSIFICATION] else None
         )
 
     # training data helpers:
@@ -856,7 +856,7 @@ def _load_model(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        label_key = "label_ids" if meta[LABEL_CLASSIFICATION] else "tag_ids"
+        label_key = "label_ids" if meta[LABEL_CLASSIFICATION] else None
         model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
         model = DIET.load(
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 57b16e83d85f..db743e05fa48 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -146,7 +146,7 @@ def __init__(
 
         raise_warning(
             f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in"
-            f"combination with 'LexicalSyntacticFeaturizer'.",
+            f"combination with 'LexicalSyntacticFeaturizer' instead.",
             category=DeprecationWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )

From 65cd1a73a045a7174ed3eca54b01681e2d82de43 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 14:27:25 +0100
Subject: [PATCH 312/633] add test

---
 .../extractors/test_crf_entity_extractor.py   | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 tests/nlu/extractors/test_crf_entity_extractor.py

diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
new file mode 100644
index 000000000000..be956e6c5c17
--- /dev/null
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -0,0 +1,61 @@
+from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, ENTITIES_ATTRIBUTE
+from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
+
+
+def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
+    examples = [
+        Message(
+            "anywhere in the west",
+            {
+                "intent": "restaurant_search",
+                "entities": [
+                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
+                ],
+                "spacy_doc": spacy_nlp("anywhere in the west"),
+            },
+        ),
+        Message(
+            "central indian restaurant",
+            {
+                "intent": "restaurant_search",
+                "entities": [
+                    {
+                        "start": 0,
+                        "end": 7,
+                        "value": "central",
+                        "entity": "location",
+                        "extractor": "random_extractor",
+                    },
+                    {
+                        "start": 8,
+                        "end": 14,
+                        "value": "indian",
+                        "entity": "cuisine",
+                        "extractor": "CRFEntityExtractor",
+                    },
+                ],
+                "spacy_doc": spacy_nlp("central indian restaurant"),
+            },
+        ),
+    ]
+
+    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
+    tokenizer = WhitespaceTokenizer()
+
+    training_data = TrainingData(training_examples=examples)
+    tokenizer.train(training_data)
+    extractor.train(training_data)
+
+    sentence = "italian restaurant"
+    message = Message(sentence, {SPACY_DOCS[TEXT_ATTRIBUTE]: spacy_nlp(sentence)})
+
+    tokenizer.process(message)
+    extractor.process(message)
+
+    detected_entities = message.get(ENTITIES_ATTRIBUTE)
+
+    assert len(detected_entities) == 1
+    assert detected_entities[0]["entity"] == "cuisine"
+    assert detected_entities[0]["value"] == "italian"

From 1d921c6d5846511e2133de0cfb9850b1f40f7583 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 14:38:41 +0100
Subject: [PATCH 313/633] update warnings

---
 rasa/core/policies/embedding_policy.py              | 2 +-
 rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +-
 rasa/nlu/extractors/crf_entity_extractor.py         | 4 ++--
 rasa/utils/train_utils.py                           | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index c931b96d1f62..4388ea31815c 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -124,6 +124,6 @@ def __init__(
 
         raise_warning(
             f"'EmbeddingPolicy' is deprecated. Use 'TEDPolicy' instead.",
-            category=DeprecationWarning,
+            category=FutureWarning,
             docs=f"{DOCS_BASE_URL}/core/policies/",
         )
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 12b1191654e6..4911d2d8aeac 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -149,6 +149,6 @@ def __init__(
 
         raise_warning(
             f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead.",
-            category=DeprecationWarning,
+            category=FutureWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index db743e05fa48..2d97fda3f172 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -145,9 +145,9 @@ def __init__(
         )
 
         raise_warning(
-            f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in"
+            f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in "
             f"combination with 'LexicalSyntacticFeaturizer' instead.",
-            category=DeprecationWarning,
+            category=FutureWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 46940ade1be9..9152e17beb6d 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -69,7 +69,7 @@ def _replace_deprecated_option(
 ) -> Dict[Text, Any]:
     if old_option in config:
         logger.warning(
-            f"Option '{old_option}' got renamed to {new_option}. "
+            f"Option '{old_option}' got renamed to '{new_option}'. "
             f"Please update your configuration file."
         )
         config[new_option] = config[old_option]

From bb05158162c177e6e71439ea3d3de0cf6d1cc628 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 15:01:57 +0100
Subject: [PATCH 314/633] Update filename for crf featurizer.

---
 rasa/nlu/extractors/crf_entity_extractor.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 2d97fda3f172..5f865e4c1321 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -173,7 +173,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
     def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
 
-        self.featurizer.persist(file_name, model_dir)
+        self.featurizer.persist(file_name + ".featurizer", model_dir)
 
         return super().persist(file_name, model_dir)
 
@@ -194,8 +194,10 @@ def load(
             )
             return cls(component_config=meta)
 
+        featurizer_meta = meta.copy()
+        featurizer_meta["file"] += ".featurizer"
         featurizer = LexicalSyntacticFeaturizer.load(
-            meta, model_dir, model_metadata, cached_component, **kwargs
+            featurizer_meta, model_dir, model_metadata, cached_component, **kwargs
         )
 
         (

From 3b9a9689c49fc1472a8176825530a9a433370ec5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 15:10:29 +0100
Subject: [PATCH 315/633] fix neg sampling and add DIET2DIET to response
 selector

---
 rasa/nlu/classifiers/diet_classifier.py       | 240 ++++++++++++------
 rasa/nlu/registry.py                          |   2 +-
 ...ponse_selector.py => response_selector.py} | 149 ++++++++++-
 rasa/utils/tensorflow/tf_layers.py            |   9 +-
 rasa/utils/tensorflow/tf_model_data.py        |   2 +-
 rasa/utils/tensorflow/tf_models.py            |   6 +-
 tests/nlu/base/test_evaluation.py             |   2 +-
 7 files changed, 312 insertions(+), 98 deletions(-)
 rename rasa/nlu/selectors/{embedding_response_selector.py => response_selector.py} (65%)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c43de8254c99..d03a11b53735 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -192,6 +192,15 @@ def _check_config_parameters(self) -> None:
             self.component_config
         )
 
+        if (
+            self.component_config[MASKED_LM]
+            and not self.component_config[NUM_TRANSFORMER_LAYERS]
+        ):
+            raise ValueError(
+                "If number of transformer layers is 0,"
+                "'use_masked_language_model' option should be 'False'."
+            )
+
         if self.component_config[INTENT_CLASSIFICATION]:
             if (
                 self.component_config[SHARE_HIDDEN_LAYERS]
@@ -250,6 +259,10 @@ def __init__(
             "label_ids" if self.component_config[INTENT_CLASSIFICATION] else "tag_ids"
         )
 
+    @staticmethod
+    def model_name():
+        return DIET
+
     # training data helpers:
     @staticmethod
     def _create_label_id_dict(
@@ -343,9 +356,9 @@ def _extract_and_add_features(
     def check_input_dimension_consistency(self, model_data: RasaModelData):
         if self.component_config[SHARE_HIDDEN_LAYERS]:
             num_text_features = model_data.get_feature_dimension("text_features")
-            num_intent_features = model_data.get_feature_dimension("label_features")
+            num_label_features = model_data.get_feature_dimension("label_features")
 
-            if num_text_features != num_intent_features:
+            if num_text_features != num_label_features:
                 raise ValueError(
                     "If embeddings are shared text features and label features "
                     "must coincide. Check the output dimensions of previous components."
@@ -422,6 +435,12 @@ def _create_label_data(
 
         label_data = RasaModelData()
         label_data.add_features("label_features", features)
+
+        label_ids = np.array([idx for (idx, _) in labels_idx_example])
+        # explicitly add last dimension to label_ids
+        # to track correctly dynamic sequences
+        label_data.add_features("label_ids", [np.expand_dims(label_ids, -1)])
+
         label_data.add_mask("label_mask", "label_features")
 
         return label_data
@@ -580,7 +599,7 @@ def train(
         # keep one example for persisting and loading
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
-        self.model = DIET(
+        self.model = self.model_name()(
             model_data.get_signature(),
             self._label_data,
             self.inverted_tag_dict,
@@ -859,7 +878,7 @@ def _load_model(
         label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else "tag_ids"
         model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
-        model = DIET.load(
+        model = cls.model_name().load(
             tf_model_file,
             model_data_example,
             data_signature=model_data_example.get_signature(),
@@ -905,6 +924,12 @@ def __init__(
         self._num_tags = len(inverted_tag_dict) if inverted_tag_dict is not None else 0
 
         self.config = config
+        if self.config[SHARE_HIDDEN_LAYERS]:
+            self.text_name = "text_label"
+            self.label_name = "text_label"
+        else:
+            self.text_name = "text"
+            self.label_name = "label"
 
         # tf objects
         self._tf_layers = {}
@@ -937,68 +962,72 @@ def _update_metrics_to_log(self) -> None:
             self.metrics_to_log += ["e_loss", "e_f1"]
 
     def _prepare_layers(self) -> None:
-        self._prepare_sequence_layers()
+        self._prepare_sequence_layers(self.text_name)
         if self.config[MASKED_LM]:
-            self._prepare_mask_lm_layers()
+            self._prepare_mask_lm_layers("text")
         if self.config[INTENT_CLASSIFICATION]:
-            self._prepare_intent_classification_layers()
+            self._prepare_input_layers(self.label_name)
+            self._prepare_label_classification_layers()
         if self.config[ENTITY_RECOGNITION]:
             self._prepare_entity_recognition_layers()
 
-    @staticmethod
-    def _create_sparse_dense_layer(
+    def _prepare_sparse_dense_layers(
+        self,
         feature_signatures: List[FeatureSignature],
         name: Text,
         reg_lambda: float,
         dense_dim: int,
-    ) -> Optional[tf_layers.DenseForSparse]:
-
+    ) -> None:
         sparse = False
+        dense = False
         for is_sparse, shape in feature_signatures:
             if is_sparse:
-                sparse = is_sparse
+                sparse = True
             else:
+                dense = True
                 # if dense features are present
                 # use the feature dimension of the dense features
                 dense_dim = shape[-1]
 
         if sparse:
-            return tf_layers.DenseForSparse(
+            self._tf_layers[f"sparse_to_dense.{name}"] = tf_layers.DenseForSparse(
                 units=dense_dim, reg_lambda=reg_lambda, name=name
             )
+            if not dense:
+                # create dense labels for the input to use in negative sampling
+                self._tf_layers[
+                    f"sparse_to_dense_ids.{name}"
+                ] = tf_layers.DenseForSparse(
+                    units=2, trainable=False, name=f"sparse_to_dense_ids.{name}"
+                )
+
+    def _prepare_input_layers(self, name: Text) -> None:
+        if f"{name}_features" not in self.data_signature:
+            raise KeyError(
+                f"Features for {name} are not present "
+                f"in data signature {self.data_signature}"
+            )
 
-    def _prepare_sequence_layers(self) -> None:
-        self._tf_layers["sparse_dropout"] = tf_layers.SparseDropout(
+        self._tf_layers[f"sparse_dropout.{name}"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
-        if "text_features" in self.data_signature:
-            self._tf_layers["sparse_to_dense.text"] = self._create_sparse_dense_layer(
-                self.data_signature["text_features"],
-                "text",
-                self.config[C2],
-                self.config[DENSE_DIM]["text"],
-            )
-        if "label_features" in self.data_signature:
-            self._tf_layers["sparse_to_dense.label"] = self._create_sparse_dense_layer(
-                self.data_signature["label_features"],
-                "label",
-                self.config[C2],
-                self.config[DENSE_DIM]["label"],
-            )
-        self._tf_layers["ffnn.text"] = tf_layers.Ffnn(
-            self.config[HIDDEN_LAYERS_SIZES_TEXT],
+        self._prepare_sparse_dense_layers(
+            self.data_signature[f"{name}_features"],
+            name,
+            self.config[C2],
+            self.config[DENSE_DIM][name],
+        )
+        self._tf_layers[f"ffnn.{name}"] = tf_layers.Ffnn(
+            self.config[HIDDEN_LAYERS_SIZES_LABEL],
             self.config[DROPRATE],
             self.config[C2],
-            "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "text",
+            name,
         )
-        if self.config[INTENT_CLASSIFICATION]:
-            self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
-                self.config[HIDDEN_LAYERS_SIZES_LABEL],
-                self.config[DROPRATE],
-                self.config[C2],
-                "text_intent" if self.config[SHARE_HIDDEN_LAYERS] else "label",
-            )
-        self._tf_layers["transformer"] = (
+
+    def _prepare_sequence_layers(self, name: Text) -> None:
+        self._prepare_input_layers(name)
+
+        self._tf_layers[f"{name}_transformer"] = (
             tf_layers.TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
                 self.config[TRANSFORMER_SIZE],
@@ -1009,27 +1038,27 @@ def _prepare_sequence_layers(self) -> None:
                 dropout_rate=self.config[DROPRATE],
                 attention_dropout_rate=self.config[DROPRATE],
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
-                name="text_encoder",
+                name=f"{name}_encoder",
             )
             if self.config[NUM_TRANSFORMER_LAYERS] > 0
             else lambda x, mask, training: x
         )
 
-    def _prepare_mask_lm_layers(self) -> None:
-        self._tf_layers["input_mask"] = tf_layers.InputMask()
-        self._tf_layers["embed.lm_mask"] = tf_layers.Embed(
+    def _prepare_mask_lm_layers(self, name: Text) -> None:
+        self._tf_layers[f"{name}_input_mask"] = tf_layers.InputMask()
+        self._tf_layers[f"embed.{name}_lm_mask"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "lm_mask",
+            f"{name}_lm_mask",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.golden_token"] = tf_layers.Embed(
+        self._tf_layers[f"embed.{name}_golden_token"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "golden_token",
+            f"{name}_golden_token",
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["loss.mask"] = tf_layers.DotProductLoss(
+        self._tf_layers[f"loss.{name}_mask"] = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MU_POS],
@@ -1041,7 +1070,7 @@ def _prepare_mask_lm_layers(self) -> None:
             parallel_iterations=1 if self.random_seed is not None else 1000,
         )
 
-    def _prepare_intent_classification_layers(self) -> None:
+    def _prepare_label_classification_layers(self) -> None:
         self._tf_layers["embed.text"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
@@ -1082,7 +1111,7 @@ def _get_sequence_lengths(mask: tf.Tensor) -> tf.Tensor:
 
     def _combine_sparse_dense_features(
         self,
-        features: List[Union[tf.Tensor, tf.SparseTensor]],
+        features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]],
         mask: tf.Tensor,
         name: Text,
         sparse_dropout: bool = False,
@@ -1093,7 +1122,7 @@ def _combine_sparse_dense_features(
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 if sparse_dropout:
-                    _f = self._tf_layers["sparse_dropout"](f, self._training)
+                    _f = self._tf_layers[f"sparse_dropout.{name}"](f, self._training)
                 else:
                     _f = f
                 dense_features.append(self._tf_layers[f"sparse_to_dense.{name}"](_f))
@@ -1102,47 +1131,76 @@ def _combine_sparse_dense_features(
 
         return tf.concat(dense_features, axis=-1) * mask
 
+    def _features_as_seq_ids(
+        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text,
+    ) -> tf.Tensor:
+        # if there are dense features it's enough
+        for f in features:
+            if not isinstance(f, tf.SparseTensor):
+                return tf.stop_gradient(f)
+
+        # we need dense labels for negative sampling
+        for f in features:
+            if isinstance(f, tf.SparseTensor):
+                return tf.stop_gradient(
+                    self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
+                )
+
     def _create_bow(
         self,
-        features: List[Union[tf.Tensor, "tf.SparseTensor"]],
+        features: List[Union[tf.Tensor, tf.SparseTensor]],
         mask: tf.Tensor,
         name: Text,
         sparse_dropout: bool = False,
     ) -> tf.Tensor:
 
         x = self._combine_sparse_dense_features(features, mask, name, sparse_dropout)
-        return self._tf_layers[f"ffnn.{name}"](tf.reduce_sum(x, 1), self._training)
+        x = tf.reduce_sum(x, 1)  # convert to bag-of-words
+        return self._tf_layers[f"ffnn.{name}"](x, self._training)
 
     def _create_sequence(
         self,
-        features: List[Union[tf.Tensor, "tf.SparseTensor"]],
+        features: List[Union[tf.Tensor, tf.SparseTensor]],
         mask: tf.Tensor,
         name: Text,
         masked_lm_loss: bool = False,
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        sequence_ids: bool = False,
+    ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
+        if sequence_ids:
+            x_seq_ids = self._features_as_seq_ids(features, name)
+        else:
+            x_seq_ids = None
+
         x = self._combine_sparse_dense_features(
             features, mask, name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT]
         )
 
+        pre = self._tf_layers[f"ffnn.{name}"](x, self._training)
+
         if masked_lm_loss:
-            pre, lm_mask_bool = self._tf_layers["input_mask"](x, mask, self._training)
+            pre, lm_mask_bool = self._tf_layers[f"{name}_input_mask"](
+                pre, mask, self._training
+            )
         else:
-            pre, lm_mask_bool = (x, None)
+            lm_mask_bool = None
 
-        transformed = self._tf_layers["transformer"](pre, 1 - mask, self._training)
+        transformed = self._tf_layers[f"{name}_transformer"](
+            pre, 1 - mask, self._training
+        )
         transformed = tfa.activations.gelu(transformed)
 
-        return transformed, x, lm_mask_bool
+        return transformed, x, x_seq_ids, lm_mask_bool
 
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_labels = self._create_bow(
+        all_label_ids = self.tf_label_data["label_ids"][0]
+        x = self._create_bow(
             self.tf_label_data["label_features"],
             self.tf_label_data["label_mask"][0],
-            "label",
+            self.label_name,
         )
-        all_labels_embed = self._tf_layers["embed.label"](all_labels)
+        all_labels_embed = self._tf_layers["embed.label"](x)
 
-        return all_labels, all_labels_embed
+        return all_label_ids, all_labels_embed
 
     @staticmethod
     def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
@@ -1151,7 +1209,12 @@ def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
         return tf.gather_nd(x, idxs)
 
     def _mask_loss(
-        self, a_transformed: tf.Tensor, a: tf.Tensor, lm_mask_bool: tf.Tensor
+        self,
+        a_transformed: tf.Tensor,
+        a: tf.Tensor,
+        a_seq_ids: tf.Tensor,
+        lm_mask_bool: tf.Tensor,
+        name: Text,
     ) -> tf.Tensor:
         # make sure there is at least one element in the mask
         lm_mask_bool = tf.cond(
@@ -1163,22 +1226,29 @@ def _mask_loss(
         lm_mask_bool = tf.squeeze(lm_mask_bool, -1)
         a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
         a_masked = tf.boolean_mask(a, lm_mask_bool)
+        a_masked_ids = tf.boolean_mask(a_seq_ids, lm_mask_bool)
 
-        a_t_masked_embed = self._tf_layers["embed.lm_mask"](a_t_masked)
-        a_masked_embed = self._tf_layers["embed.golden_token"](a_masked)
+        a_t_masked_embed = self._tf_layers[f"embed.{name}_lm_mask"](a_t_masked)
+        a_masked_embed = self._tf_layers[f"embed.{name}_golden_token"](a_masked)
 
-        return self._tf_layers["loss.mask"](
-            a_t_masked_embed, a_masked_embed, a_masked, a_masked_embed, a_masked
+        return self._tf_layers[f"loss.{name}_mask"](
+            a_t_masked_embed,
+            a_masked_embed,
+            a_masked_ids,
+            a_masked_embed,
+            a_masked_ids,
         )
 
-    def _intent_loss(self, a: tf.Tensor, b: tf.Tensor) -> tf.Tensor:
-        all_labels, all_labels_embed = self._create_all_labels()
+    def _label_loss(
+        self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor
+    ) -> tf.Tensor:
+        all_label_ids, all_labels_embed = self._create_all_labels()
 
         a_embed = self._tf_layers["embed.text"](a)
         b_embed = self._tf_layers["embed.label"](b)
 
         return self._tf_layers["loss.label"](
-            a_embed, b_embed, b, all_labels_embed, all_labels
+            a_embed, b_embed, label_ids, all_labels_embed, all_label_ids
         )
 
     def _entity_loss(
@@ -1216,14 +1286,25 @@ def batch_loss(
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = self._get_sequence_lengths(mask_text)
 
-        text_transformed, text_in, lm_mask_bool_text = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, "text", self.config[MASKED_LM]
+        (
+            text_transformed,
+            text_in,
+            text_seq_ids,
+            lm_mask_bool_text,
+        ) = self._create_sequence(
+            tf_batch_data["text_features"],
+            mask_text,
+            self.text_name,
+            self.config[MASKED_LM],
+            sequence_ids=True,
         )
 
         losses = []
 
         if self.config[MASKED_LM]:
-            loss, acc = self._mask_loss(text_transformed, text_in, lm_mask_bool_text)
+            loss, acc = self._mask_loss(
+                text_transformed, text_in, text_seq_ids, lm_mask_bool_text, "text"
+            )
             self.mask_loss.update_state(loss)
             self.mask_acc.update_state(acc)
             losses.append(loss)
@@ -1232,10 +1313,13 @@ def batch_loss(
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
 
+            label_ids = tf_batch_data["label_ids"][0]
             label = self._create_bow(
-                tf_batch_data["label_features"], tf_batch_data["label_mask"][0], "label"
+                tf_batch_data["label_features"],
+                tf_batch_data["label_mask"][0],
+                self.label_name,
             )
-            loss, acc = self._intent_loss(cls, label)
+            loss, acc = self._label_loss(cls, label, label_ids)
             self.intent_loss.update_state(loss)
             self.intent_acc.update_state(acc)
             losses.append(loss)
@@ -1262,8 +1346,8 @@ def batch_predict(
         mask_text = tf_batch_data["text_mask"][0]
         sequence_lengths = self._get_sequence_lengths(mask_text)
 
-        text_transformed, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, "text"
+        text_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, self.text_name
         )
 
         out = {}
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index eba0bd50afdd..75e30d5df9c2 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -31,7 +31,7 @@
 )
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
-from rasa.nlu.selectors.embedding_response_selector import ResponseSelector
+from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/response_selector.py
similarity index 65%
rename from rasa/nlu/selectors/embedding_response_selector.py
rename to rasa/nlu/selectors/response_selector.py
index 1b20938306a7..1de86b98b2c5 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -1,8 +1,12 @@
 import logging
-from typing import Any, Dict, Text, Optional
+
+import numpy as np
+import tensorflow as tf
+
+from typing import Any, Dict, List, Optional, Text, Tuple, Union
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_TEXT,
@@ -89,16 +93,16 @@ class ResponseSelector(DIETClassifier):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
+        HIDDEN_LAYERS_SIZES_TEXT: [],
         # sizes of hidden layers before the embedding layer for intent labels
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [256, 128],
+        HIDDEN_LAYERS_SIZES_LABEL: [],
         # Whether to share the hidden layer weights between input words and intent labels
         SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
-        TRANSFORMER_SIZE: 128,
+        TRANSFORMER_SIZE: 256,
         # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 1,
+        NUM_TRANSFORMER_LAYERS: 2,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # max sequence length if pos_encoding='emb'
@@ -148,17 +152,20 @@ class ResponseSelector(DIETClassifier):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: True,
+        UNIDIRECTIONAL_ENCODER: False,
+        # if true apply dropout to sparse tensors
+        SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
+        MASKED_LM: False,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
-        # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: False,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -172,11 +179,10 @@ def __init__(
     ):
         component_config = component_config or {}
 
-        # the following properties are fixed for the ResponseSelector
+        # the following properties don't exist for the ResponseSelector
         component_config[INTENT_CLASSIFICATION] = True
-        component_config[ENTITY_RECOGNITION] = False
-        component_config[MASKED_LM] = False
-        component_config[BILOU_FLAG] = False
+        component_config[ENTITY_RECOGNITION] = None
+        component_config[BILOU_FLAG] = None
 
         super().__init__(
             component_config,
@@ -186,6 +192,10 @@ def __init__(
             batch_tuple_sizes,
         )
 
+    @staticmethod
+    def model_name():
+        return DIET2DIET
+
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
@@ -257,3 +267,116 @@ def process(self, message: Message, **kwargs: Any) -> None:
         prediction_dict = {"response": label, "ranking": label_ranking}
 
         self._set_message_property(message, prediction_dict, selector_key)
+
+
+class DIET2DIET(DIET):
+    def _prepare_layers(self) -> None:
+        self._prepare_sequence_layers(self.text_name)
+        self._prepare_sequence_layers(self.label_name)
+        if self.config[MASKED_LM]:
+            self._prepare_mask_lm_layers(self.text_name)
+            self._prepare_mask_lm_layers(self.label_name)
+        self._prepare_label_classification_layers()
+
+    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_label_ids = self.tf_label_data["label_ids"][0]
+
+        mask_label = self.tf_label_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            self.tf_label_data["label_features"], mask_label, self.label_name,
+        )
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+
+        all_labels_embed = self._tf_layers["embed.label"](cls_label)
+
+        return all_label_ids, all_labels_embed
+
+    def batch_loss(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> tf.Tensor:
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        (
+            text_transformed,
+            text_in,
+            text_seq_ids,
+            lm_mask_bool_text,
+        ) = self._create_sequence(
+            tf_batch_data["text_features"],
+            mask_text,
+            self.text_name,
+            self.config[MASKED_LM],
+            sequence_ids=True,
+        )
+
+        mask_label = tf_batch_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["label_features"], mask_label, self.label_name,
+        )
+
+        losses = []
+
+        if self.config[MASKED_LM]:
+            loss, acc = self._mask_loss(
+                text_transformed,
+                text_in,
+                text_seq_ids,
+                lm_mask_bool_text,
+                self.text_name,
+            )
+
+            self.mask_loss.update_state(loss)
+            self.mask_acc.update_state(acc)
+            losses.append(loss)
+
+        # get _cls_ vector for label classification
+        cls_text = self._last_token(text_transformed, sequence_lengths_text)
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+        label_ids = tf_batch_data["label_ids"][0]
+
+        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
+        self.intent_loss.update_state(loss)
+        self.intent_acc.update_state(acc)
+        losses.append(loss)
+
+        return tf.math.add_n(losses)
+
+    def batch_predict(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
+        tf_batch_data = self.batch_to_model_data_format(
+            batch_in, self.predict_data_signature
+        )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        text_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, self.text_name
+        )
+
+        out = {}
+
+        if self.all_labels_embed is None:
+            _, self.all_labels_embed = self._create_all_labels()
+
+        # get _cls_ vector for intent classification
+        cls = self._last_token(text_transformed, sequence_lengths_text)
+        cls_embed = self._tf_layers["embed.text"](cls)
+
+        sim_all = self._tf_layers["loss.label"].sim(
+            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
+        )
+        scores = self._tf_layers["loss.label"].confidence_from_sim(
+            sim_all, self.config[SIMILARITY_TYPE]
+        )
+        out["i_scores"] = scores
+
+        return out
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 61f9d0eda836..c157e32d1100 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -29,10 +29,13 @@ def dropped_inputs():
 class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor"""
 
-    def __init__(self, reg_lambda: float, **kwargs) -> None:
-        l1_regularizer = tf.keras.regularizers.l1(reg_lambda)
+    def __init__(self, reg_lambda: float = 0, **kwargs) -> None:
+        if reg_lambda > 0:
+            regularizer = tf.keras.regularizers.l1(reg_lambda)
+        else:
+            regularizer = None
 
-        super().__init__(kernel_regularizer=l1_regularizer, **kwargs)
+        super().__init__(kernel_regularizer=regularizer, **kwargs)
 
     def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         if not isinstance(inputs, tf.SparseTensor):
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index ffd804dc1e79..d0e0bb08a368 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -499,7 +499,7 @@ def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
 
         if array_of_dense[0].ndim < 2:
             # data doesn't contain a sequence
-            return array_of_dense
+            return array_of_dense.astype(np.float32)
 
         data_size = len(array_of_dense)
         max_seq_len = max([x.shape[0] for x in array_of_dense])
diff --git a/rasa/utils/tensorflow/tf_models.py b/rasa/utils/tensorflow/tf_models.py
index f1ad5694ae35..90c1296c5f03 100644
--- a/rasa/utils/tensorflow/tf_models.py
+++ b/rasa/utils/tensorflow/tf_models.py
@@ -320,7 +320,11 @@ def batch_to_model_data_format(
                     )
                     idx += 3
                 else:
-                    batch_data[k].append(batch[idx])
+                    if isinstance(batch[idx], tf.Tensor):
+                        batch_data[k].append(batch[idx])
+                    else:
+                        # convert to Tensor
+                        batch_data[k].append(tf.constant(batch[idx], dtype=tf.float32))
                     idx += 1
 
         return batch_data
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 2c5a76a8c217..87a7951f2626 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -48,7 +48,7 @@
 from rasa.nlu import training_data, config
 from tests.nlu import utilities
 from tests.nlu.conftest import DEFAULT_DATA_PATH, NLU_DEFAULT_CONFIG_PATH
-from rasa.nlu.selectors.embedding_response_selector import ResponseSelector
+from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.nlu.test import is_response_selector_present
 
 

From a8a0098b7754ebe519ef36feb1c6c12417c102f1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 15:33:50 +0100
Subject: [PATCH 316/633] remove debug statement

---
 rasa/utils/tensorflow/tf_model_data.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index 18664d4c3384..6f99c4a86199 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -183,9 +183,6 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
 
         # skip balancing if labels are token based
         if self.label_key is None or data[self.label_key][0][0].size > 1:
-            logger.debug(
-                f"Skip balancing data for '{self.label_key}' as data is a sequence."
-            )
             return data
 
         label_ids = self._create_label_ids(data[self.label_key][0])

From 2acc84edb8133cb30ca1d7c3294a4e3d3c15d77a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 15:42:40 +0100
Subject: [PATCH 317/633] add label_key to response selector

---
 rasa/nlu/selectors/response_selector.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index c5b9335b923a..04bf92993fdc 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -192,6 +192,7 @@ def __init__(
             model,
             batch_tuple_sizes,
         )
+        self.label_key = "label_ids"
 
     @staticmethod
     def model_name():

From f23a1f02cd3268492e4c863d6f7ac83b55df782e Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 10 Feb 2020 15:43:24 +0100
Subject: [PATCH 318/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 9fd1ac737ff5..b4071c2d4242 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -194,7 +194,7 @@ def _check_config_parameters(self) -> None:
 
         if (
             self.component_config[MASKED_LM]
-            and not self.component_config[NUM_TRANSFORMER_LAYERS]
+            and self.component_config[NUM_TRANSFORMER_LAYERS] == 0
         ):
             raise ValueError(
                 "If number of transformer layers is 0,"

From de836d2e667d6b43cff38adb648f725721f08541 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 10 Feb 2020 15:49:29 +0100
Subject: [PATCH 319/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b4071c2d4242..66b76b1b3392 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -980,7 +980,7 @@ def _update_metrics_to_log(self) -> None:
     def _prepare_layers(self) -> None:
         self._prepare_sequence_layers(self.text_name)
         if self.config[MASKED_LM]:
-            self._prepare_mask_lm_layers("text")
+            self._prepare_mask_lm_layers(self.text_name)
         if self.config[INTENT_CLASSIFICATION]:
             self._prepare_input_layers(self.label_name)
             self._prepare_label_classification_layers()

From e860b0a220ee6a4e0a98aaf8fa66465ae45f03b1 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 15:55:04 +0100
Subject: [PATCH 320/633] fix response metrics

---
 rasa/nlu/classifiers/diet_classifier.py | 14 +++++++-------
 rasa/nlu/selectors/response_selector.py | 20 +++++++++++++++++---
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b4071c2d4242..be86e4b997b0 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -197,8 +197,8 @@ def _check_config_parameters(self) -> None:
             and self.component_config[NUM_TRANSFORMER_LAYERS] == 0
         ):
             raise ValueError(
-                "If number of transformer layers is 0,"
-                "'use_masked_language_model' option should be 'False'."
+                f"If number of transformer layers is 0, "
+                f"'{MASKED_LM}' option should be 'False'."
             )
 
         if (
@@ -207,7 +207,7 @@ def _check_config_parameters(self) -> None:
             != self.component_config[HIDDEN_LAYERS_SIZES_LABEL]
         ):
             raise ValueError(
-                "If hidden layer weights are shared,"
+                "If hidden layer weights are shared, "
                 "hidden_layer_sizes for text and label must coincide."
             )
 
@@ -962,11 +962,11 @@ def _create_metrics(self):
         # self.metrics preserve order
         # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
+        self.response_loss = tf.keras.metrics.Mean(name="i_loss")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
         # output accuracies second
         self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
-        self.intent_acc = tf.keras.metrics.Mean(name="i_acc")
+        self.response_acc = tf.keras.metrics.Mean(name="i_acc")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
 
     def _update_metrics_to_log(self) -> None:
@@ -1336,8 +1336,8 @@ def batch_loss(
                 self.label_name,
             )
             loss, acc = self._label_loss(cls, label, label_ids)
-            self.intent_loss.update_state(loss)
-            self.intent_acc.update_state(acc)
+            self.response_loss.update_state(loss)
+            self.response_acc.update_state(acc)
             losses.append(loss)
 
         if self.config[ENTITY_RECOGNITION]:
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 04bf92993fdc..ede43bc7a685 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -272,12 +272,26 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
 
 class DIET2DIET(DIET):
+    def _create_metrics(self):
+        # self.metrics preserve order
+        # output losses first
+        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
+        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
+        # output accuracies second
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
+
+    def _update_metrics_to_log(self) -> None:
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_loss", "m_acc"]
+
+        self.metrics_to_log += ["r_loss", "r_acc"]
+
     def _prepare_layers(self) -> None:
         self._prepare_sequence_layers(self.text_name)
         self._prepare_sequence_layers(self.label_name)
         if self.config[MASKED_LM]:
             self._prepare_mask_lm_layers(self.text_name)
-            self._prepare_mask_lm_layers(self.label_name)
         self._prepare_label_classification_layers()
 
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -344,8 +358,8 @@ def batch_loss(
         label_ids = tf_batch_data["label_ids"][0]
 
         loss, acc = self._label_loss(cls_text, cls_label, label_ids)
-        self.intent_loss.update_state(loss)
-        self.intent_acc.update_state(acc)
+        self.response_loss.update_state(loss)
+        self.response_acc.update_state(acc)
         losses.append(loss)
 
         return tf.math.add_n(losses)

From 16aa01f04b865331f4c4344a9428541e5619baf5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 16:06:43 +0100
Subject: [PATCH 321/633] add TED policy with relative attention to tests

---
 rasa/nlu/selectors/embedding_response_selector.py |  3 +++
 tests/core/test_policies.py                       | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index f08f70f36125..15e204fb73ba 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -31,6 +31,7 @@
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
+    DROPRATE_ATTENTION,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -149,6 +150,8 @@ class ResponseSelector(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: True,
         # visualization of accuracy
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 5d0a4a2c080b..573080142d44 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -39,6 +39,9 @@
     SCALE_LOSS,
     EVAL_NUM_EXAMPLES,
     EPOCHS,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
 )
 from rasa.utils import train_utils
 from tests.core.conftest import (
@@ -512,6 +515,18 @@ def test_featurizer(self, trained_policy, tmpdir):
         )
 
 
+class TestTEDPolicyWithRelativeAttention(TestTEDPolicy):
+    def create_policy(self, featurizer, priority):
+        p = TEDPolicy(
+            featurizer=featurizer,
+            priority=priority,
+            **{KEY_RELATIVE_ATTENTION: True,
+               VALUE_RELATIVE_ATTENTION: True,
+               MAX_RELATIVE_POSITION:5},
+        )
+        return p
+
+
 class TestTEDPolicyWithTfConfig(TestTEDPolicy):
     def create_policy(self, featurizer, priority):
         p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())

From d56b405ca779fe8855df0e13d7661cd9551dc7ec Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 16:19:00 +0100
Subject: [PATCH 322/633] rename model_name to model_class

---
 rasa/nlu/classifiers/diet_classifier.py |  6 +++---
 rasa/nlu/selectors/response_selector.py |  2 +-
 tests/nlu/base/test_persistor.py        | 10 +++++-----
 tests/test_train.py                     | 12 ++++++------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 359232f1f524..ebfbbccbbaea 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -259,7 +259,7 @@ def __init__(
         )
 
     @staticmethod
-    def model_name():
+    def model_class() -> Any:
         return DIET
 
     # training data helpers:
@@ -598,7 +598,7 @@ def train(
         # keep one example for persisting and loading
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
-        self.model = self.model_name()(
+        self.model = self.model_class()(
             model_data.get_signature(),
             self._label_data,
             self.inverted_tag_dict,
@@ -877,7 +877,7 @@ def _load_model(
         label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else None
         model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
-        model = cls.model_name().load(
+        model = cls.model_class().load(
             tf_model_file,
             model_data_example,
             data_signature=model_data_example.get_signature(),
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index ede43bc7a685..b99568a94a98 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -195,7 +195,7 @@ def __init__(
         self.label_key = "label_ids"
 
     @staticmethod
-    def model_name():
+    def model_class():
         return DIET2DIET
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
diff --git a/tests/nlu/base/test_persistor.py b/tests/nlu/base/test_persistor.py
index 8371060a37bd..e53bc52688ad 100644
--- a/tests/nlu/base/test_persistor.py
+++ b/tests/nlu/base/test_persistor.py
@@ -79,7 +79,7 @@ def test_list_models_method_in_GCSPersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_name")
+            "blob_name": ("project", "model_class")
         }[x]
         self.bucket = Object()
 
@@ -93,7 +93,7 @@ def mocked_list_blobs():
     with patch.object(persistor.GCSPersistor, "__init__", mocked_init):
         result = persistor.GCSPersistor("").list_models()
 
-    assert result == ["model_name"]
+    assert result == ["model_class"]
 
 
 # noinspection PyPep8Naming
@@ -101,7 +101,7 @@ def test_list_models_method_raise_exeception_in_GCSPersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_name")
+            "blob_name": ("project", "model_class")
         }[x]
         self.bucket = Object()
 
@@ -121,7 +121,7 @@ def test_list_models_method_in_AzurePersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_name")
+            "blob_name": ("project", "model_class")
         }[x]
         self.blob_client = Object()
         self.container_name = "test"
@@ -137,7 +137,7 @@ def mocked_list_blobs(container_name, prefix=None):
     with patch.object(persistor.AzurePersistor, "__init__", mocked_init):
         result = persistor.AzurePersistor("", "", "").list_models()
 
-    assert result == ["model_name"]
+    assert result == ["model_class"]
 
 
 # noinspection PyPep8Naming
diff --git a/tests/test_train.py b/tests/test_train.py
index af9dc02412f5..1fd0cb51b592 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -16,9 +16,9 @@
 @pytest.mark.parametrize(
     "parameters",
     [
-        {"model_name": "test-1234", "prefix": None},
-        {"model_name": None, "prefix": "core-"},
-        {"model_name": None, "prefix": None},
+        {"model_class": "test-1234", "prefix": None},
+        {"model_class": None, "prefix": "core-"},
+        {"model_class": None, "prefix": None},
     ],
 )
 def test_package_model(trained_rasa_model, parameters):
@@ -29,7 +29,7 @@ def test_package_model(trained_rasa_model, parameters):
         _fingerprint(),
         output_path,
         train_path,
-        parameters["model_name"],
+        parameters["model_class"],
         parameters["prefix"],
     )
 
@@ -37,8 +37,8 @@ def test_package_model(trained_rasa_model, parameters):
 
     file_name = os.path.basename(model_path)
 
-    if parameters["model_name"]:
-        assert parameters["model_name"] in file_name
+    if parameters["model_class"]:
+        assert parameters["model_class"] in file_name
 
     if parameters["prefix"]:
         assert parameters["prefix"] in file_name

From 40c31d49a96936faa1026f5e4ee48e16d4ed7dc8 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 16:24:47 +0100
Subject: [PATCH 323/633] black

---
 tests/core/test_policies.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 573080142d44..d39a6e741a77 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -520,9 +520,11 @@ def create_policy(self, featurizer, priority):
         p = TEDPolicy(
             featurizer=featurizer,
             priority=priority,
-            **{KEY_RELATIVE_ATTENTION: True,
-               VALUE_RELATIVE_ATTENTION: True,
-               MAX_RELATIVE_POSITION:5},
+            **{
+                KEY_RELATIVE_ATTENTION: True,
+                VALUE_RELATIVE_ATTENTION: True,
+                MAX_RELATIVE_POSITION: 5,
+            },
         )
         return p
 

From 80ce519cb343031318b80606ecee2b8deed5e4a0 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 16:32:04 +0100
Subject: [PATCH 324/633] rename model_class to model_name in tests back

---
 tests/nlu/base/test_persistor.py | 10 +++++-----
 tests/test_train.py              | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/nlu/base/test_persistor.py b/tests/nlu/base/test_persistor.py
index e53bc52688ad..8371060a37bd 100644
--- a/tests/nlu/base/test_persistor.py
+++ b/tests/nlu/base/test_persistor.py
@@ -79,7 +79,7 @@ def test_list_models_method_in_GCSPersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_class")
+            "blob_name": ("project", "model_name")
         }[x]
         self.bucket = Object()
 
@@ -93,7 +93,7 @@ def mocked_list_blobs():
     with patch.object(persistor.GCSPersistor, "__init__", mocked_init):
         result = persistor.GCSPersistor("").list_models()
 
-    assert result == ["model_class"]
+    assert result == ["model_name"]
 
 
 # noinspection PyPep8Naming
@@ -101,7 +101,7 @@ def test_list_models_method_raise_exeception_in_GCSPersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_class")
+            "blob_name": ("project", "model_name")
         }[x]
         self.bucket = Object()
 
@@ -121,7 +121,7 @@ def test_list_models_method_in_AzurePersistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
-            "blob_name": ("project", "model_class")
+            "blob_name": ("project", "model_name")
         }[x]
         self.blob_client = Object()
         self.container_name = "test"
@@ -137,7 +137,7 @@ def mocked_list_blobs(container_name, prefix=None):
     with patch.object(persistor.AzurePersistor, "__init__", mocked_init):
         result = persistor.AzurePersistor("", "", "").list_models()
 
-    assert result == ["model_class"]
+    assert result == ["model_name"]
 
 
 # noinspection PyPep8Naming
diff --git a/tests/test_train.py b/tests/test_train.py
index 1fd0cb51b592..af9dc02412f5 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -16,9 +16,9 @@
 @pytest.mark.parametrize(
     "parameters",
     [
-        {"model_class": "test-1234", "prefix": None},
-        {"model_class": None, "prefix": "core-"},
-        {"model_class": None, "prefix": None},
+        {"model_name": "test-1234", "prefix": None},
+        {"model_name": None, "prefix": "core-"},
+        {"model_name": None, "prefix": None},
     ],
 )
 def test_package_model(trained_rasa_model, parameters):
@@ -29,7 +29,7 @@ def test_package_model(trained_rasa_model, parameters):
         _fingerprint(),
         output_path,
         train_path,
-        parameters["model_class"],
+        parameters["model_name"],
         parameters["prefix"],
     )
 
@@ -37,8 +37,8 @@ def test_package_model(trained_rasa_model, parameters):
 
     file_name = os.path.basename(model_path)
 
-    if parameters["model_class"]:
-        assert parameters["model_class"] in file_name
+    if parameters["model_name"]:
+        assert parameters["model_name"] in file_name
 
     if parameters["prefix"]:
         assert parameters["prefix"] in file_name

From 7f2070cc02a59ab1f2454a5d7f9493011adffdec Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 10 Feb 2020 17:05:01 +0100
Subject: [PATCH 325/633] fix random seed test

---
 tests/nlu/training/test_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index ceb6ce52d3f5..a6f3e9b46fd6 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -99,7 +99,7 @@ async def test_random_seed(component_builder, tmpdir):
 
     _config = utilities.base_test_conf("supervised_embeddings")
     # set fixed random seed of the embedding intent classifier to 1
-    _config.set_component_attr(6, random_seed=1)
+    _config.set_component_attr(5, random_seed=1)
 
     # first run
     (trained_a, _, persisted_path_a) = await train(

From e0e3aab602481343e1371a4e3d76c33978822051 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 19:39:22 +0100
Subject: [PATCH 326/633] fix wrong renaming

---
 rasa/nlu/classifiers/diet_classifier.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index ebfbbccbbaea..2ec3543b31f1 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -962,7 +962,7 @@ def _create_metrics(self):
         # self.metrics preserve order
         # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.response_loss = tf.keras.metrics.Mean(name="i_loss")
+        self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
         # output accuracies second
         self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
@@ -1020,8 +1020,8 @@ def _prepare_sparse_dense_layers(
     def _prepare_input_layers(self, name: Text) -> None:
         if f"{name}_features" not in self.data_signature:
             raise KeyError(
-                f"Features for {name} are not present "
-                f"in data signature {self.data_signature}"
+                f"Features for '{name}' are not present "
+                f"in data signature: {self.data_signature}."
             )
 
         self._tf_layers[f"sparse_dropout.{name}"] = tf_layers.SparseDropout(
@@ -1336,7 +1336,7 @@ def batch_loss(
                 self.label_name,
             )
             loss, acc = self._label_loss(cls, label, label_ids)
-            self.response_loss.update_state(loss)
+            self.intent_loss.update_state(loss)
             self.response_acc.update_state(acc)
             losses.append(loss)
 

From 8d7a10bd9cc240fd49aec808fd6afe4e34261e2b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 20:03:44 +0100
Subject: [PATCH 327/633] add EmbeddingPolicy to registry

---
 rasa/core/policies/registry.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/core/policies/registry.py b/rasa/core/policies/registry.py
index 350ed735742a..e219424bd5f4 100644
--- a/rasa/core/policies/registry.py
+++ b/rasa/core/policies/registry.py
@@ -4,6 +4,9 @@
 # noinspection PyUnresolvedReferences
 from rasa.core.policies.ted_policy import TEDPolicy
 
+# noinspection PyUnresolvedReferences
+from rasa.core.policies.embedding_policy import EmbeddingPolicy
+
 # noinspection PyUnresolvedReferences
 from rasa.core.policies.fallback import FallbackPolicy
 

From 615c7d553fdc36c72c1b5ce8028aaa15992f713d Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 20:12:55 +0100
Subject: [PATCH 328/633] fix wrong imports

---
 rasa/core/policies/embedding_policy.py              | 12 ++++++------
 rasa/core/policies/ted_policy.py                    |  2 +-
 rasa/nlu/classifiers/embedding_intent_classifier.py | 13 ++++++-------
 rasa/nlu/extractors/crf_entity_extractor.py         | 12 +++++-------
 rasa/utils/train_utils.py                           |  4 ++--
 5 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 4388ea31815c..c780097dac3c 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -18,8 +18,8 @@
     NUM_NEG,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    C_EMB,
-    C2,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_SIM_NEG,
     MU_NEG,
@@ -33,7 +33,7 @@
     DROPRATE_LABEL,
 )
 from rasa.utils.common import raise_warning
-from rasa.utils.tensorflow.tf_models import RasaModel
+from rasa.utils.tensorflow.models import RasaModel
 
 logger = logging.getLogger(__name__)
 
@@ -94,11 +94,11 @@ class EmbeddingPolicy(TEDPolicy):
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization
-        # the scale of L2 regularization
-        C2: 0.001,
+        # the scale of regularization
+        REGULARIZATION_CONSTANT: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
-        C_EMB: 0.8,
+        NEG_MARGIN_SCALE: 0.8,
         # dropout rate for dial nn
         DROPRATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 8135388e2e34..cef7f155f869 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -119,7 +119,7 @@ class TEDPolicy(Policy):
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization
-        # the scale of L2 regularization
+        # the scale of regularization
         REGULARIZATION_CONSTANT: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 4911d2d8aeac..93ec9b76d4af 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -6,7 +6,6 @@
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
-    ENTITIES_ATTRIBUTE,
     DENSE_FEATURE_NAMES,
     SPARSE_FEATURE_NAMES,
 )
@@ -32,8 +31,8 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     DROPRATE,
-    C_EMB,
-    C2,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_SIM_NEG,
     MU_NEG,
@@ -42,7 +41,7 @@
     BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
-from rasa.utils.tensorflow.tf_models import RasaModel
+from rasa.utils.tensorflow.models import RasaModel
 
 logger = logging.getLogger(__name__)
 
@@ -104,11 +103,11 @@ class EmbeddingIntentClassifier(DIETClassifier):
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization parameters
-        # the scale of L2 regularization
-        C2: 0.002,
+        # the scale of regularization
+        REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different labels
-        C_EMB: 0.8,
+        NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
         # if true apply dropout to sparse tensors
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 5f865e4c1321..0dd8bff8e24d 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -10,13 +10,10 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.constants import DOCS_BASE_URL
-from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
     ENTITIES_ATTRIBUTE,
-    DENSE_FEATURE_NAMES,
-    SPARSE_FEATURE_NAMES,
     TOKENS_NAMES,
 )
 from rasa.utils.tensorflow.constants import (
@@ -37,11 +34,11 @@
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
-    C2,
+    REGULARIZATION_CONSTANT,
     BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
-from rasa.utils.tensorflow.tf_models import RasaModel
+from rasa.utils.tensorflow.models import RasaModel
 
 logger = logging.getLogger(__name__)
 
@@ -96,8 +93,8 @@ class CRFEntityExtractor(DIETClassifier):
         # default dense dimension used if no dense features are present
         DENSE_DIM: {"text": 512, "label": 20},
         # regularization parameters
-        # the scale of L2 regularization
-        C2: 0.002,
+        # the scale of regularization
+        REGULARIZATION_CONSTANT: 0.002,
         # dropout rate for rnn
         DROPRATE: 0.2,
         # if true apply dropout to sparse tensors
@@ -131,6 +128,7 @@ def __init__(
         component_config[MASKED_LM] = False
         component_config[NUM_TRANSFORMER_LAYERS] = 0
         component_config[SHARE_HIDDEN_LAYERS] = False
+        component_config[UNIDIRECTIONAL_ENCODER] = False
 
         super().__init__(
             component_config,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 9152e17beb6d..e22f366c98f9 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -15,7 +15,7 @@
     NUM_NEG,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    C2,
+    REGULARIZATION_CONSTANT,
     USE_MAX_SIM_NEG,
     MU_NEG,
     MU_POS,
@@ -104,7 +104,7 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
     config = _replace_deprecated_option("mu_pos", MU_POS, config)
     config = _replace_deprecated_option("mu_neg", MU_NEG, config)
     config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_SIM_NEG, config)
-    config = _replace_deprecated_option("C2", C2, config)
+    config = _replace_deprecated_option("C2", REGULARIZATION_CONSTANT, config)
     config = _replace_deprecated_option(
         "evaluate_every_num_epochs", EVAL_NUM_EPOCHS, config
     )

From 46387e1bdd4f3a16925704a2791dd2fe9b581e20 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 10 Feb 2020 20:21:06 +0100
Subject: [PATCH 329/633] use get to change similarity type

---
 rasa/utils/train_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 9152e17beb6d..2ce1bd0ef39e 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -55,7 +55,7 @@ def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarr
 
 
 def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
-    if config[SIMILARITY_TYPE] == "auto":
+    if config.get(SIMILARITY_TYPE) == "auto":
         if config[LOSS_TYPE] == "softmax":
             config[SIMILARITY_TYPE] = "inner"
         elif config[LOSS_TYPE] == "margin":

From 06094093e71db09f466a8f7810ddf0b6d68859b2 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 00:46:06 +0100
Subject: [PATCH 330/633] fix response featurization

---
 rasa/nlu/featurizers/featurizer.py                   |  3 +++
 .../sparse_featurizer/count_vectors_featurizer.py    | 12 ++++++++----
 .../sparse_featurizer/regex_featurizer.py            |  8 ++++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index 20be3bde66e9..3dd94dce44bf 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -53,6 +53,9 @@ def _combine_with_existing_sparse_features(
         additional_features: Any,
         feature_name: Text = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
     ) -> Any:
+        if additional_features is None:
+            return
+
         if message.get(feature_name) is not None:
             from scipy.sparse import hstack
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index d83de943a457..712ca70bfd18 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -269,9 +269,9 @@ def _get_processed_message_tokens_by_attribute(
         """Get processed text of attribute of a message"""
 
         if message.get(attribute) is None:
-            # return empty string since sklearn countvectorizer does not like None
+            # return empty list since sklearn countvectorizer does not like None
             # object while training and predicting
-            return [""]
+            return []
 
         tokens = self._get_message_tokens_by_attribute(message, attribute)
         tokens = self._process_tokens(tokens, attribute)
@@ -404,10 +404,14 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])
 
     def _create_sequence(
         self, attribute: Text, all_tokens: List[List[Text]]
-    ) -> List[scipy.sparse.coo_matrix]:
+    ) -> List[Optional[scipy.sparse.coo_matrix]]:
         X = []
 
         for i, tokens in enumerate(all_tokens):
+            if not tokens:
+                # nothing to featurize
+                X.append(None)
+
             # vectorizer.transform returns a sparse matrix of size
             # [n_samples, n_features]
             # set input to list of tokens if sequence should be returned
@@ -434,7 +438,7 @@ def _create_sequence(
 
     def _get_featurized_attribute(
         self, attribute: Text, all_tokens: List[List[Text]]
-    ) -> Optional[List[scipy.sparse.coo_matrix]]:
+    ) -> Optional[List[Optional[scipy.sparse.coo_matrix]]]:
         """Return features of a particular attribute for complete data"""
 
         if self._check_attribute_vocabulary(attribute):
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index 2f11cce22487..4012a8bbc9e0 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -82,14 +82,18 @@ def _add_lookup_table_regexes(
 
     def _features_for_patterns(
         self, message: Message, attribute: Text
-    ) -> scipy.sparse.coo_matrix:
+    ) -> Optional[scipy.sparse.coo_matrix]:
         """Checks which known patterns match the message.
 
         Given a sentence, returns a vector of {1,0} values indicating which
         regexes did match. Furthermore, if the
         message is tokenized, the function will mark all tokens with a dict
         relating the name of the regex to whether it was matched."""
-        tokens = message.get(TOKENS_NAMES[attribute], [])
+        tokens = message.get(TOKENS_NAMES[attribute])
+        if not tokens:
+            # nothing to featurize
+            return
+
         seq_length = len(tokens)
 
         vec = np.zeros([seq_length, len(self.known_patterns)])

From 4c536b5dc1b09c6ed5f8b4fcf587aeae7996fbcf Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 01:02:24 +0100
Subject: [PATCH 331/633] continue for loop

---
 .../featurizers/sparse_featurizer/count_vectors_featurizer.py    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 712ca70bfd18..3b9062006529 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -411,6 +411,7 @@ def _create_sequence(
             if not tokens:
                 # nothing to featurize
                 X.append(None)
+                continue
 
             # vectorizer.transform returns a sparse matrix of size
             # [n_samples, n_features]

From de52219790cd8358d27e72f637d39bce2aead567 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 11 Feb 2020 08:14:48 +0100
Subject: [PATCH 332/633] fix cloudpickle version

---
 requirements.txt | 2 ++
 setup.py         | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index f58d5afa0ddb..dae333d3e3ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,6 +39,8 @@ terminaltables==3.1.0
 sanic==19.9.0
 sanic-cors==0.9.9.post1
 sanic-jwt==1.3.2
+# needed because of https://github.com/RasaHQ/rasa/issues/5216
+cloudpickle==1.2.2
 # https://github.com/RasaHQ/rasa/pull/5064
 sanic-plugins-framework==0.8.2
 # needed because of https://github.com/huge-success/sanic/issues/1729
diff --git a/setup.py b/setup.py
index 264e63bf31ce..87d6f07fc627 100644
--- a/setup.py
+++ b/setup.py
@@ -69,6 +69,8 @@
     "sanic~=19.9.0",
     "sanic-cors==0.9.9.post1",
     "sanic-jwt~=1.3",
+    # needed because of https://github.com/RasaHQ/rasa/issues/5216
+    "cloudpickle==1.2.2",
     # needed because of https://github.com/huge-success/sanic/issues/1729
     "multidict==4.6.1",
     "aiohttp~=3.5",

From 00a3835e09577117c85d00f6217e59c492cc69df Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 11 Feb 2020 08:15:40 +0100
Subject: [PATCH 333/633] add embeddingpolicy to registry

---
 rasa/core/policies/registry.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/core/policies/registry.py b/rasa/core/policies/registry.py
index 350ed735742a..39f6e4fc5a4b 100644
--- a/rasa/core/policies/registry.py
+++ b/rasa/core/policies/registry.py
@@ -24,3 +24,6 @@
 
 # noinspection PyUnresolvedReferences
 from rasa.core.policies.mapping_policy import MappingPolicy
+
+# noinspection PyUnresolvedReferences
+from rasa.core.policies.embedding_policy import EmbeddingPolicy

From f566d561cbab145da2dee31af2489a5baeaefd41 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 09:48:18 +0100
Subject: [PATCH 334/633] fix different config options

---
 rasa/core/constants.py                        |  2 +
 rasa/core/policies/embedding_policy.py        | 13 ++--
 rasa/core/policies/ted_policy.py              | 35 ++++-----
 rasa/nlu/classifiers/diet_classifier.py       | 76 ++++++++-----------
 .../embedding_intent_classifier.py            | 23 ++----
 .../classifiers/keyword_intent_classifier.py  | 14 ++--
 .../classifiers/mitie_intent_classifier.py    | 10 +--
 .../classifiers/sklearn_intent_classifier.py  |  8 +-
 rasa/nlu/components.py                        |  4 +-
 rasa/nlu/constants.py                         | 36 ++++-----
 rasa/nlu/extractors/__init__.py               | 10 +--
 rasa/nlu/extractors/crf_entity_extractor.py   | 20 ++---
 .../nlu/extractors/duckling_http_extractor.py |  8 +-
 rasa/nlu/extractors/entity_synonyms.py        | 10 +--
 rasa/nlu/extractors/mitie_entity_extractor.py | 14 ++--
 rasa/nlu/extractors/spacy_entity_extractor.py |  8 +-
 .../dense_featurizer/convert_featurizer.py    | 12 +--
 .../dense_featurizer/mitie_featurizer.py      |  8 +-
 .../dense_featurizer/spacy_featurizer.py      |  4 +-
 rasa/nlu/featurizers/featurizer.py            |  6 +-
 .../count_vectors_featurizer.py               | 26 +++----
 .../lexical_syntactic_featurizer.py           | 14 ++--
 .../sparse_featurizer/regex_featurizer.py     | 14 ++--
 rasa/nlu/selectors/response_selector.py       | 42 +++++-----
 rasa/nlu/test.py                              |  4 +-
 rasa/nlu/tokenizers/tokenizer.py              | 18 ++---
 rasa/nlu/training_data/formats/markdown.py    |  4 +-
 rasa/nlu/training_data/message.py             | 22 +++---
 rasa/nlu/training_data/training_data.py       | 10 +--
 rasa/nlu/training_data/util.py                | 10 +--
 rasa/nlu/utils/bilou_utils.py                 | 26 +++----
 rasa/nlu/utils/spacy_utils.py                 |  4 +-
 rasa/utils/tensorflow/constants.py            |  5 +-
 rasa/utils/train_utils.py                     | 39 ++++++----
 tests/nlu/base/test_training_data.py          |  6 +-
 tests/nlu/classifiers/test_diet_classifier.py | 18 ++---
 .../extractors/test_crf_entity_extractor.py   |  6 +-
 .../featurizers/test_convert_featurizer.py    | 32 ++++----
 .../test_count_vectors_featurizer.py          | 72 ++++++++----------
 tests/nlu/featurizers/test_featurizer.py      | 10 +--
 .../test_lexical_syntactic_featurizer.py      | 18 ++---
 .../nlu/featurizers/test_mitie_featurizer.py  | 22 +++---
 .../nlu/featurizers/test_regex_featurizer.py  | 36 ++++-----
 .../nlu/featurizers/test_spacy_featurizer.py  | 24 +++---
 .../nlu/tokenizers/test_convert_tokenizer.py  | 10 +--
 tests/nlu/tokenizers/test_jieba_tokenizer.py  | 12 ++-
 tests/nlu/tokenizers/test_mitie_tokenizer.py  | 10 +--
 tests/nlu/tokenizers/test_spacy_tokenizer.py  | 30 ++++----
 tests/nlu/tokenizers/test_tokenizer.py        | 18 ++---
 .../tokenizers/test_whitespace_tokenizer.py   | 12 ++-
 tests/nlu/utils/test_bilou_utils.py           | 18 ++---
 51 files changed, 425 insertions(+), 488 deletions(-)

diff --git a/rasa/core/constants.py b/rasa/core/constants.py
index 02188d8aa8c0..75441562de6d 100644
--- a/rasa/core/constants.py
+++ b/rasa/core/constants.py
@@ -56,3 +56,5 @@
 FORM_POLICY_PRIORITY = 5
 UTTER_PREFIX = "utter_"
 RESPOND_PREFIX = "respond_"
+
+DIALOGUE = "dialogue"
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 4388ea31815c..881ae0d0e1ba 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1,12 +1,13 @@
 import logging
 from typing import Any, Dict, Optional, Text
 
-from rasa.core.constants import DEFAULT_POLICY_PRIORITY
+from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.ted_policy import TEDPolicy
 from rasa.constants import DOCS_BASE_URL
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     BATCH_SIZES,
     BATCH_STRATEGY,
@@ -25,7 +26,6 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
-    HIDDEN_LAYERS_SIZES_DIALOGUE,
     TRANSFORMER_SIZE,
     MAX_SEQ_LENGTH,
     NUM_HEADS,
@@ -47,12 +47,9 @@ class EmbeddingPolicy(TEDPolicy):
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # a list of hidden layers sizes before user embed layer
+        # a list of hidden layers sizes before dialogue and action embed layers
         # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_DIALOGUE: [],
-        # a list of hidden layers sizes before bot embed layer
-        # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [],
+        HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
         # number of units in transformer
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index dce22bd80109..225acaf91fae 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -18,14 +18,15 @@
     MaxHistoryTrackerFeaturizer,
 )
 from rasa.core.policies.policy import Policy
-from rasa.core.constants import DEFAULT_POLICY_PRIORITY
+from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
 from rasa.core.trackers import DialogueStateTracker
 from rasa.utils import train_utils
 from rasa.utils.tensorflow import tf_layers
 from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
@@ -47,7 +48,6 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
-    HIDDEN_LAYERS_SIZES_DIALOGUE,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
 )
@@ -67,12 +67,9 @@ class TEDPolicy(Policy):
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # a list of hidden layers sizes before user embed layer
+        # a list of hidden layers sizes before dialogue and action embed layers
         # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_DIALOGUE: [],
-        # a list of hidden layers sizes before bot embed layer
-        # number of hidden layers is equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [],
+        HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
         # number of units in transformer
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
@@ -462,8 +459,8 @@ def __init__(
         )
 
         # metrics
-        self.metric_loss = tf.keras.metrics.Mean(name="loss")
-        self.metric_acc = tf.keras.metrics.Mean(name="acc")
+        self.action_loss = tf.keras.metrics.Mean(name="loss")
+        self.action_acc = tf.keras.metrics.Mean(name="acc")
         self.metrics_to_log += ["loss", "acc"]
 
         # set up tf layers
@@ -483,16 +480,16 @@ def _prepare_layers(self) -> None:
             parallel_iterations=1 if self.random_seed is not None else 1000,
         )
         self._tf_layers["ffnn.dialogue"] = tf_layers.Ffnn(
-            self.config[HIDDEN_LAYERS_SIZES_DIALOGUE],
+            self.config[HIDDEN_LAYERS_SIZES][DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
             self.config[C2],
-            layer_name_suffix="dialogue",
+            layer_name_suffix=DIALOGUE,
         )
         self._tf_layers["ffnn.label"] = tf_layers.Ffnn(
-            self.config[HIDDEN_LAYERS_SIZES_LABEL],
+            self.config[HIDDEN_LAYERS_SIZES][LABEL],
             self.config[DROPRATE_LABEL],
             self.config[C2],
-            layer_name_suffix="label",
+            layer_name_suffix=LABEL,
         )
         self._tf_layers["transformer"] = tf_layers.TransformerEncoder(
             self.config[NUM_TRANSFORMER_LAYERS],
@@ -504,18 +501,18 @@ def _prepare_layers(self) -> None:
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=0,
             unidirectional=True,
-            name="dialogue_encoder",
+            name=DIALOGUE + "_encoder",
         )
         self._tf_layers["embed.dialogue"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "dialogue",
+            DIALOGUE,
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.label"] = tf_layers.Embed(
             self.config[EMBED_DIM],
             self.config[C2],
-            "label",
+            LABEL,
             self.config[SIMILARITY_TYPE],
         )
 
@@ -572,8 +569,8 @@ def batch_loss(
             dialogue_embed, label_embed, label_in, all_labels_embed, all_labels, mask
         )
 
-        self.metric_loss.update_state(loss)
-        self.metric_acc.update_state(acc)
+        self.action_loss.update_state(loss)
+        self.action_acc.update_state(acc)
 
         return loss
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 2ec3543b31f1..e62e883db901 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -22,9 +22,9 @@
 from rasa.utils.tensorflow.tf_models import RasaModel
 from rasa.utils.tensorflow.tf_model_data import RasaModelData, FeatureSignature
 from rasa.nlu.constants import (
-    INTENT_ATTRIBUTE,
-    TEXT_ATTRIBUTE,
-    ENTITIES_ATTRIBUTE,
+    INTENT,
+    TEXT,
+    ENTITIES,
     SPARSE_FEATURE_NAMES,
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
@@ -34,8 +34,8 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_TEXT,
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
@@ -94,21 +94,15 @@ class DIETClassifier(EntityExtractor):
 
     provides = ["intent", "intent_ranking", "entities"]
 
-    requires = [
-        any_of(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        )
-    ]
+    requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # sizes of hidden layers before the embedding layer for input words
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [],
-        # sizes of hidden layers before the embedding layer for intent labels
+        # sizes of hidden layers before the embedding layer
+        # for input words and intent labels,
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [],
+        HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []},
         # Whether to share the hidden layer weights between input words and labels
         SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
@@ -133,7 +127,7 @@ class DIETClassifier(EntityExtractor):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {"text": 512, "label": 20},
+        DENSE_DIM: {TEXT: 512, LABEL: 20},
         # dimension size of embedding vectors
         EMBED_DIM: 20,
         # the type of the similarity
@@ -201,15 +195,15 @@ def _check_config_parameters(self) -> None:
                 f"'{MASKED_LM}' option should be 'False'."
             )
 
-        if (
-            self.component_config[SHARE_HIDDEN_LAYERS]
-            and self.component_config[HIDDEN_LAYERS_SIZES_TEXT]
-            != self.component_config[HIDDEN_LAYERS_SIZES_LABEL]
-        ):
-            raise ValueError(
-                "If hidden layer weights are shared, "
-                "hidden_layer_sizes for text and label must coincide."
-            )
+        if self.component_config.get(SHARE_HIDDEN_LAYERS):
+            v1 = next(iter(self.component_config[HIDDEN_LAYERS_SIZES].values()))
+            if any(
+                v != v1 for v in self.component_config[HIDDEN_LAYERS_SIZES].values()
+            ):
+                raise ValueError(
+                    f"If hidden layer weights are shared, "
+                    f"{HIDDEN_LAYERS_SIZES} must coincide."
+                )
 
         self.component_config = train_utils.update_similarity_type(
             self.component_config
@@ -286,7 +280,7 @@ def _create_tag_id_dict(self, training_data: TrainingData) -> Dict[Text, int]:
             [
                 e["entity"]
                 for example in training_data.entity_examples
-                for e in example.get(ENTITIES_ATTRIBUTE)
+                for e in example.get(ENTITIES)
             ]
         ) - {None}
 
@@ -353,7 +347,7 @@ def _extract_and_add_features(
         return sparse_features, dense_features
 
     def check_input_dimension_consistency(self, model_data: RasaModelData):
-        if self.component_config[SHARE_HIDDEN_LAYERS]:
+        if self.component_config.get(SHARE_HIDDEN_LAYERS):
             num_text_features = model_data.get_feature_dimension("text_features")
             num_label_features = model_data.get_feature_dimension("label_features")
 
@@ -364,7 +358,7 @@ def check_input_dimension_consistency(self, model_data: RasaModelData):
                 )
 
     def _extract_labels_precomputed_features(
-        self, label_examples: List[Message], attribute: Text = INTENT_ATTRIBUTE
+        self, label_examples: List[Message], attribute: Text = INTENT
     ) -> List[np.ndarray]:
         """Collect precomputed encodings"""
 
@@ -472,7 +466,7 @@ def _create_model_data(
 
         for e in training_data:
             if label_attribute is None or e.get(label_attribute):
-                _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE)
+                _sparse, _dense = self._extract_and_add_features(e, TEXT)
                 if _sparse is not None:
                     X_sparse.append(_sparse)
                 if _dense is not None:
@@ -493,10 +487,8 @@ def _create_model_data(
                     _tags = bilou_utils.tags_to_ids(e, tag_id_dict)
                 else:
                     _tags = []
-                    for t in e.get(TOKENS_NAMES[TEXT_ATTRIBUTE]):
-                        _tag = determine_token_labels(
-                            t, e.get(ENTITIES_ATTRIBUTE), None
-                        )
+                    for t in e.get(TOKENS_NAMES[TEXT]):
+                        _tag = determine_token_labels(t, e.get(ENTITIES), None)
                         _tags.append(tag_id_dict[_tag])
                 # transpose to have seq_len x 1
                 tag_ids.append(np.array([_tags]).T)
@@ -536,20 +528,18 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         if self.component_config[BILOU_FLAG]:
             bilou_utils.apply_bilou_schema(training_data)
 
-        label_id_dict = self._create_label_id_dict(
-            training_data, attribute=INTENT_ATTRIBUTE
-        )
+        label_id_dict = self._create_label_id_dict(training_data, attribute=INTENT)
         self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
 
         self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=INTENT_ATTRIBUTE
+            training_data, label_id_dict, attribute=INTENT
         )
 
         tag_id_dict = self._create_tag_id_dict(training_data)
         self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
 
         label_attribute = (
-            INTENT_ATTRIBUTE if self.component_config[INTENT_CLASSIFICATION] else None
+            INTENT if self.component_config[INTENT_CLASSIFICATION] else None
         )
 
         model_data = self._create_model_data(
@@ -923,12 +913,6 @@ def __init__(
         self._num_tags = len(inverted_tag_dict) if inverted_tag_dict is not None else 0
 
         self.config = config
-        if self.config[SHARE_HIDDEN_LAYERS]:
-            self.text_name = "text_label"
-            self.label_name = "text_label"
-        else:
-            self.text_name = "text"
-            self.label_name = "label"
 
         # tf objects
         self._tf_layers = {}
@@ -978,10 +962,12 @@ def _update_metrics_to_log(self) -> None:
             self.metrics_to_log += ["e_loss", "e_f1"]
 
     def _prepare_layers(self) -> None:
+        self.text_name = TEXT
         self._prepare_sequence_layers(self.text_name)
         if self.config[MASKED_LM]:
             self._prepare_mask_lm_layers(self.text_name)
         if self.config[INTENT_CLASSIFICATION]:
+            self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
             self._prepare_input_layers(self.label_name)
             self._prepare_label_classification_layers()
         if self.config[ENTITY_RECOGNITION]:
@@ -1034,7 +1020,7 @@ def _prepare_input_layers(self, name: Text) -> None:
             self.config[DENSE_DIM][name],
         )
         self._tf_layers[f"ffnn.{name}"] = tf_layers.Ffnn(
-            self.config[HIDDEN_LAYERS_SIZES_LABEL],
+            self.config[HIDDEN_LAYERS_SIZES][name],
             self.config[DROPRATE],
             self.config[C2],
             name,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index e805a84a0433..3a1b595f70b6 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -5,14 +5,13 @@
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
-    ENTITIES_ATTRIBUTE,
+    TEXT,
     DENSE_FEATURE_NAMES,
     SPARSE_FEATURE_NAMES,
 )
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_TEXT,
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
     NUM_TRANSFORMER_LAYERS,
     BATCH_SIZES,
@@ -51,21 +50,15 @@ class EmbeddingIntentClassifier(DIETClassifier):
 
     provides = ["intent", "intent_ranking"]
 
-    requires = [
-        any_of(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        )
-    ]
+    requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # sizes of hidden layers before the embedding layer for input words
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
-        # sizes of hidden layers before the embedding layer for intent labels
+        # sizes of hidden layers before the embedding layer
+        # for input words and intent labels,
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [],
+        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: []},
         # Whether to share the hidden layer weights between input words and labels
         SHARE_HIDDEN_LAYERS: False,
         # training parameters
@@ -82,7 +75,7 @@ class EmbeddingIntentClassifier(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {"text": 512, "label": 20},
+        DENSE_DIM: {TEXT: 512, LABEL: 20},
         # dimension size of embedding vectors
         EMBED_DIM: 20,
         # the type of the similarity
diff --git a/rasa/nlu/classifiers/keyword_intent_classifier.py b/rasa/nlu/classifiers/keyword_intent_classifier.py
index eea75e464442..ab5d5ddba15e 100644
--- a/rasa/nlu/classifiers/keyword_intent_classifier.py
+++ b/rasa/nlu/classifiers/keyword_intent_classifier.py
@@ -7,7 +7,7 @@
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu import utils
 from rasa.nlu.components import Component
-from rasa.nlu.constants import INTENT_ATTRIBUTE
+from rasa.nlu.constants import INTENT
 from rasa.utils.common import raise_warning
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import TrainingData
@@ -26,7 +26,7 @@ class KeywordIntentClassifier(Component):
 
     """
 
-    provides = [INTENT_ATTRIBUTE]
+    provides = [INTENT]
 
     defaults = {"case_sensitive": True}
 
@@ -52,19 +52,19 @@ def train(
         for ex in training_data.training_examples:
             if (
                 ex.text in self.intent_keyword_map.keys()
-                and ex.get(INTENT_ATTRIBUTE) != self.intent_keyword_map[ex.text]
+                and ex.get(INTENT) != self.intent_keyword_map[ex.text]
             ):
                 duplicate_examples.add(ex.text)
                 raise_warning(
                     f"Keyword '{ex.text}' is a keyword to trigger intent "
                     f"'{self.intent_keyword_map[ex.text]}' and also "
-                    f"intent '{ex.get(INTENT_ATTRIBUTE)}', it will be removed "
+                    f"intent '{ex.get(INTENT)}', it will be removed "
                     f"from the list of keywords for both of them. "
                     f"Remove (one of) the duplicates from the training data.",
                     docs=DOCS_URL_COMPONENTS + "#keyword-intent-classifier",
                 )
             else:
-                self.intent_keyword_map[ex.text] = ex.get(INTENT_ATTRIBUTE)
+                self.intent_keyword_map[ex.text] = ex.get(INTENT)
         for keyword in duplicate_examples:
             self.intent_keyword_map.pop(keyword)
             logger.debug(
@@ -107,8 +107,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
         confidence = 0.0 if intent_name is None else 1.0
         intent = {"name": intent_name, "confidence": confidence}
 
-        if message.get(INTENT_ATTRIBUTE) is None or intent is not None:
-            message.set(INTENT_ATTRIBUTE, intent, add_to_output=True)
+        if message.get(INTENT) is None or intent is not None:
+            message.set(INTENT, intent, add_to_output=True)
 
     def _map_keyword_to_intent(self, text: Text) -> Optional[Text]:
         re_flag = 0 if self.case_sensitive else re.IGNORECASE
diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py
index 2fc76ab294cb..dd05a58f9c77 100644
--- a/rasa/nlu/classifiers/mitie_intent_classifier.py
+++ b/rasa/nlu/classifiers/mitie_intent_classifier.py
@@ -5,7 +5,7 @@
 from rasa.nlu.components import Component
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Metadata
-from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, INTENT_ATTRIBUTE
+from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT
 from rasa.nlu.training_data import Message, TrainingData
 
 if typing.TYPE_CHECKING:
@@ -14,9 +14,9 @@
 
 class MitieIntentClassifier(Component):
 
-    provides = [INTENT_ATTRIBUTE]
+    provides = [INTENT]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"]
+    requires = [TOKENS_NAMES[TEXT], "mitie_feature_extractor", "mitie_file"]
 
     def __init__(
         self, component_config: Optional[Dict[Text, Any]] = None, clf=None
@@ -52,7 +52,7 @@ def train(
 
         for example in training_data.intent_examples:
             tokens = self._tokens_of_message(example)
-            trainer.add_labeled_text(tokens, example.get(INTENT_ATTRIBUTE))
+            trainer.add_labeled_text(tokens, example.get(INTENT))
 
         if training_data.intent_examples:
             # we can not call train if there are no examples!
@@ -82,7 +82,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
     @staticmethod
     def _tokens_of_message(message) -> List[Text]:
-        tokens = [token.text for token in message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])]
+        tokens = [token.text for token in message.get(TOKENS_NAMES[TEXT], [])]
         # return tokens without CLS token
         return tokens[:-1]
 
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index bd6f6f424255..82399446a76d 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -11,7 +11,7 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import Component
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE
+from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT
 from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
@@ -28,7 +28,7 @@ class SklearnIntentClassifier(Component):
 
     provides = ["intent", "intent_ranking"]
 
-    requires = [DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
+    requires = [DENSE_FEATURE_NAMES[TEXT]]
 
     defaults = {
         # C parameter of the svm - cross validation will select the best value
@@ -105,7 +105,7 @@ def train(
             X = np.stack(
                 [
                     sequence_to_sentence_features(
-                        example.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+                        example.get(DENSE_FEATURE_NAMES[TEXT])
                     )
                     for example in training_data.intent_examples
                 ]
@@ -165,7 +165,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
             intent_ranking = []
         else:
             X = sequence_to_sentence_features(
-                message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+                message.get(DENSE_FEATURE_NAMES[TEXT])
             ).reshape(1, -1)
             intent_ids, probabilities = self.predict(X)
             intents = self.transform_labels_num2str(np.ravel(intent_ids))
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 6667f98bf9ff..1bffbbc98032 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -4,7 +4,7 @@
 from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
 
 from rasa.nlu.config import RasaNLUModelConfig, override_defaults
-from rasa.nlu.constants import RESPONSE_ATTRIBUTE
+from rasa.nlu.constants import RESPONSE
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
 
@@ -121,7 +121,7 @@ def validate_required_components_from_data(
     response_selector_exists = False
     for component in pipeline:
         # check if a response selector is part of NLU pipeline
-        if RESPONSE_ATTRIBUTE in component.provides:
+        if RESPONSE in component.provides:
             response_selector_exists = True
 
     if len(data.response_examples) and not response_selector_exists:
diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 534ba66d87b6..dcfa4037d4d0 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -1,43 +1,43 @@
-TEXT_ATTRIBUTE = "text"
+TEXT = "text"
 
 RESPONSE_KEY_ATTRIBUTE = "response_key"
 
-INTENT_ATTRIBUTE = "intent"
+INTENT = "intent"
 
-RESPONSE_ATTRIBUTE = "response"
+RESPONSE = "response"
 
-ENTITIES_ATTRIBUTE = "entities"
-BILOU_ENTITIES_ATTRIBUTE = "bilou_entities"
+ENTITIES = "entities"
+BILOU_ENTITIES = "bilou_entities"
 
-EXTRACTOR_ATTRIBUTE = "extractor"
+EXTRACTOR = "extractor"
 
 PRETRAINED_EXTRACTORS = {"DucklingHTTPExtractor", "SpacyEntityExtractor"}
 
 CLS_TOKEN = "__CLS__"
 
-MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
+MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE]
 
 TOKENS_NAMES = {
-    TEXT_ATTRIBUTE: "tokens",
-    INTENT_ATTRIBUTE: "intent_tokens",
-    RESPONSE_ATTRIBUTE: "response_tokens",
+    TEXT: "tokens",
+    INTENT: "intent_tokens",
+    RESPONSE: "response_tokens",
 }
 
 SPARSE_FEATURE_NAMES = {
-    TEXT_ATTRIBUTE: "text_sparse_features",
-    INTENT_ATTRIBUTE: "intent_sparse_features",
-    RESPONSE_ATTRIBUTE: "response_sparse_features",
+    TEXT: "text_sparse_features",
+    INTENT: "intent_sparse_features",
+    RESPONSE: "response_sparse_features",
 }
 
 DENSE_FEATURE_NAMES = {
-    TEXT_ATTRIBUTE: "text_dense_features",
-    INTENT_ATTRIBUTE: "intent_dense_features",
-    RESPONSE_ATTRIBUTE: "response_dense_features",
+    TEXT: "text_dense_features",
+    INTENT: "intent_dense_features",
+    RESPONSE: "response_dense_features",
 }
 
-SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"}
+SPACY_DOCS = {TEXT: "spacy_doc", RESPONSE: "response_spacy_doc"}
 
-DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]
+DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE]
 
 RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector"
 DEFAULT_OPEN_UTTERANCE_TYPE = "default"
diff --git a/rasa/nlu/extractors/__init__.py b/rasa/nlu/extractors/__init__.py
index c2e01d764ec1..bcdf16ffb366 100644
--- a/rasa/nlu/extractors/__init__.py
+++ b/rasa/nlu/extractors/__init__.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List, Text, Tuple
 
 from rasa.nlu.components import Component
-from rasa.nlu.constants import EXTRACTOR_ATTRIBUTE, ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import EXTRACTOR, ENTITIES
 from rasa.nlu.training_data import Message
 
 
@@ -10,7 +10,7 @@ def add_extractor_name(
         self, entities: List[Dict[Text, Any]]
     ) -> List[Dict[Text, Any]]:
         for entity in entities:
-            entity[EXTRACTOR_ATTRIBUTE] = self.name
+            entity[EXTRACTOR] = self.name
         return entities
 
     def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:
@@ -72,12 +72,12 @@ def filter_trainable_entities(
         filtered = []
         for message in entity_examples:
             entities = []
-            for ent in message.get(ENTITIES_ATTRIBUTE, []):
-                extractor = ent.get(EXTRACTOR_ATTRIBUTE)
+            for ent in message.get(ENTITIES, []):
+                extractor = ent.get(EXTRACTOR)
                 if not extractor or extractor == self.name:
                     entities.append(ent)
             data = message.data.copy()
-            data[ENTITIES_ATTRIBUTE] = entities
+            data[ENTITIES] = entities
             filtered.append(
                 Message(
                     text=message.text,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 4c52482dbc38..12479aec7f66 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -10,18 +10,14 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.constants import DOCS_BASE_URL
-from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
-    ENTITIES_ATTRIBUTE,
-    DENSE_FEATURE_NAMES,
-    SPARSE_FEATURE_NAMES,
+    TEXT,
+    ENTITIES,
     TOKENS_NAMES,
 )
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_TEXT,
-    SHARE_HIDDEN_LAYERS,
+    HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     BATCH_SIZES,
     BATCH_STRATEGY,
@@ -35,7 +31,6 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     C2,
     BILOU_FLAG,
@@ -48,9 +43,9 @@
 
 class CRFEntityExtractor(DIETClassifier):
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = [ENTITIES]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+    requires = [TOKENS_NAMES[TEXT]]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
@@ -79,7 +74,7 @@ class CRFEntityExtractor(DIETClassifier):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [256, 128],
+        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128]},
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -94,7 +89,7 @@ class CRFEntityExtractor(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {"text": 512, "label": 20},
+        DENSE_DIM: {TEXT: 512},
         # regularization parameters
         # the scale of L2 regularization
         C2: 0.002,
@@ -130,7 +125,6 @@ def __init__(
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
         component_config[NUM_TRANSFORMER_LAYERS] = 0
-        component_config[SHARE_HIDDEN_LAYERS] = False
 
         super().__init__(
             component_config,
diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py
index ec57abfd4eb2..dbc3327286c2 100644
--- a/rasa/nlu/extractors/duckling_http_extractor.py
+++ b/rasa/nlu/extractors/duckling_http_extractor.py
@@ -6,7 +6,7 @@
 from typing import Any, List, Optional, Text, Dict
 
 from rasa.constants import DOCS_URL_COMPONENTS
-from rasa.nlu.constants import ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import ENTITIES
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
@@ -53,7 +53,7 @@ def convert_duckling_format_to_rasa(
 class DucklingHTTPExtractor(EntityExtractor):
     """Searches for structured entites, e.g. dates, using a duckling server."""
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = [ENTITIES]
 
     defaults = {
         # by default all dimensions recognized by duckling are returned
@@ -189,9 +189,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         extracted = self.add_extractor_name(extracted)
         message.set(
-            ENTITIES_ATTRIBUTE,
-            message.get(ENTITIES_ATTRIBUTE, []) + extracted,
-            add_to_output=True,
+            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
         )
 
     @classmethod
diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py
index eb8fdc1e1102..3a62d6a761b3 100644
--- a/rasa/nlu/extractors/entity_synonyms.py
+++ b/rasa/nlu/extractors/entity_synonyms.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, Optional, Text
 
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
-from rasa.nlu.constants import ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import ENTITIES
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
@@ -15,7 +15,7 @@
 
 class EntitySynonymMapper(EntityExtractor):
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = [ENTITIES]
 
     def __init__(
         self,
@@ -38,15 +38,15 @@ def train(
             self.add_entities_if_synonyms(key, value)
 
         for example in training_data.entity_examples:
-            for entity in example.get(ENTITIES_ATTRIBUTE, []):
+            for entity in example.get(ENTITIES, []):
                 entity_val = example.text[entity["start"] : entity["end"]]
                 self.add_entities_if_synonyms(entity_val, str(entity.get("value")))
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        updated_entities = message.get(ENTITIES_ATTRIBUTE, [])[:]
+        updated_entities = message.get(ENTITIES, [])[:]
         self.replace_synonyms(updated_entities)
-        message.set(ENTITIES_ATTRIBUTE, updated_entities, add_to_output=True)
+        message.set(ENTITIES, updated_entities, add_to_output=True)
 
     def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
 
diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py
index 332423c024d2..92573301a169 100644
--- a/rasa/nlu/extractors/mitie_entity_extractor.py
+++ b/rasa/nlu/extractors/mitie_entity_extractor.py
@@ -4,7 +4,7 @@
 import typing
 from typing import Any, Dict, List, Optional, Text
 
-from rasa.nlu.constants import ENTITIES_ATTRIBUTE, TOKENS_NAMES, TEXT_ATTRIBUTE
+from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
@@ -20,9 +20,9 @@
 
 class MitieEntityExtractor(EntityExtractor):
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = [ENTITIES]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"]
+    requires = [TOKENS_NAMES[TEXT], "mitie_feature_extractor", "mitie_file"]
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None, ner=None):
         """Construct a new intent classifier using the sklearn framework."""
@@ -37,7 +37,7 @@ def required_packages(cls) -> List[Text]:
     @staticmethod
     def _tokens_without_cls(message: Message) -> List[Token]:
         # [:-1] to remove the CLS token from the list of tokens
-        return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+        return message.get(TOKENS_NAMES[TEXT])[:-1]
 
     def extract_entities(
         self, text: Text, tokens: List[Token], feature_extractor
@@ -104,7 +104,7 @@ def _prepare_mitie_sample(self, training_example: Message) -> Any:
         text = training_example.text
         tokens = self._tokens_without_cls(training_example)
         sample = mitie.ner_training_instance([t.text for t in tokens])
-        for ent in training_example.get(ENTITIES_ATTRIBUTE, []):
+        for ent in training_example.get(ENTITIES, []):
             try:
                 # if the token is not aligned an exception will be raised
                 start, end = MitieEntityExtractor.find_entity(ent, text, tokens)
@@ -143,9 +143,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
         )
         extracted = self.add_extractor_name(ents)
         message.set(
-            ENTITIES_ATTRIBUTE,
-            message.get(ENTITIES_ATTRIBUTE, []) + extracted,
-            add_to_output=True,
+            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
         )
 
     @classmethod
diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py
index 8c0bf9e79322..2a90c0bd6f63 100644
--- a/rasa/nlu/extractors/spacy_entity_extractor.py
+++ b/rasa/nlu/extractors/spacy_entity_extractor.py
@@ -1,7 +1,7 @@
 import typing
 from typing import Any, Dict, List, Text, Optional
 
-from rasa.nlu.constants import ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import ENTITIES
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.training_data import Message
 
@@ -11,7 +11,7 @@
 
 class SpacyEntityExtractor(EntityExtractor):
 
-    provides = [ENTITIES_ATTRIBUTE]
+    provides = [ENTITIES]
 
     requires = ["spacy_nlp"]
 
@@ -36,9 +36,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
             all_extracted, dimensions
         )
         message.set(
-            ENTITIES_ATTRIBUTE,
-            message.get(ENTITIES_ATTRIBUTE, []) + extracted,
-            add_to_output=True,
+            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
         )
 
     @staticmethod
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 5914edb6f0df..9c51ca349554 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -8,7 +8,7 @@
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     TOKENS_NAMES,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
@@ -53,7 +53,7 @@ def required_packages(cls) -> List[Text]:
         return ["tensorflow_text", "tensorflow_hub"]
 
     def _compute_features(
-        self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE
+        self, batch_examples: List[Message], attribute: Text = TEXT
     ) -> np.ndarray:
 
         sentence_encodings = self._compute_sentence_encodings(batch_examples, attribute)
@@ -68,7 +68,7 @@ def _compute_features(
         )
 
     def _compute_sentence_encodings(
-        self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE
+        self, batch_examples: List[Message], attribute: Text = TEXT
     ) -> np.ndarray:
         # Get text for attribute of each example
         batch_attribute_text = [ex.get(attribute) for ex in batch_examples]
@@ -78,7 +78,7 @@ def _compute_sentence_encodings(
         return np.reshape(sentence_encodings, (len(batch_examples), 1, -1))
 
     def _compute_sequence_encodings(
-        self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE
+        self, batch_examples: List[Message], attribute: Text = TEXT
     ) -> Tuple[np.ndarray, List[int]]:
         list_of_tokens = [
             example.get(TOKENS_NAMES[attribute]) for example in batch_examples
@@ -215,8 +215,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         features = self._compute_features([message])[0]
         message.set(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
+            DENSE_FEATURE_NAMES[TEXT],
             self._combine_with_existing_dense_features(
-                message, features, DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+                message, features, DENSE_FEATURE_NAMES[TEXT]
             ),
         )
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 6aeccac04625..dcbc04355a3a 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -11,7 +11,7 @@
     import mitie
 
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
     DENSE_FEATURE_NAMES,
@@ -82,12 +82,12 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
         features = self.features_for_tokens(
-            message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor
+            message.get(TOKENS_NAMES[TEXT]), mitie_feature_extractor
         )
         message.set(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
+            DENSE_FEATURE_NAMES[TEXT],
             self._combine_with_existing_dense_features(
-                message, features, DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+                message, features, DENSE_FEATURE_NAMES[TEXT]
             ),
         )
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 53acff7937a0..901ad394e345 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -10,7 +10,7 @@
     from spacy.tokens import Doc
 
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     SPACY_DOCS,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
@@ -62,7 +62,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         self._set_spacy_features(message)
 
-    def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
+    def _set_spacy_features(self, message: Message, attribute: Text = TEXT):
         """Adds the spacy word vectors to the messages features."""
 
         message_attribute_doc = self.get_doc(message, attribute)
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index 3dd94dce44bf..b78187c62c47 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -3,7 +3,7 @@
 from typing import Any, Text, Union, Optional
 from rasa.nlu.training_data import Message
 from rasa.nlu.components import Component
-from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE
+from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT
 
 
 def sequence_to_sentence_features(
@@ -28,7 +28,7 @@ class Featurizer(Component):
     def _combine_with_existing_dense_features(
         message: Message,
         additional_features: Any,
-        feature_name: Text = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
+        feature_name: Text = DENSE_FEATURE_NAMES[TEXT],
     ) -> Any:
         if message.get(feature_name) is not None:
 
@@ -51,7 +51,7 @@ def _combine_with_existing_dense_features(
     def _combine_with_existing_sparse_features(
         message: Message,
         additional_features: Any,
-        feature_name: Text = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
+        feature_name: Text = SPARSE_FEATURE_NAMES[TEXT],
     ) -> Any:
         if additional_features is None:
             return
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 3b9062006529..d73f4adafbb0 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -14,13 +14,13 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
     SPARSE_FEATURE_NAMES,
-    INTENT_ATTRIBUTE,
+    INTENT,
     DENSE_FEATURIZABLE_ATTRIBUTES,
-    RESPONSE_ATTRIBUTE,
+    RESPONSE,
 )
 
 logger = logging.getLogger(__name__)
@@ -222,12 +222,10 @@ def _get_message_tokens_by_attribute(
 
         return message.get(attribute).split()
 
-    def _process_tokens(
-        self, tokens: List[Text], attribute: Text = TEXT_ATTRIBUTE
-    ) -> List[Text]:
+    def _process_tokens(self, tokens: List[Text], attribute: Text = TEXT) -> List[Text]:
         """Apply processing and cleaning steps to text"""
 
-        if attribute == INTENT_ATTRIBUTE:
+        if attribute == INTENT:
             # Don't do any processing for intent attribute. Treat them as whole labels
             return tokens
 
@@ -264,7 +262,7 @@ def _replace_with_oov_token(
         return tokens
 
     def _get_processed_message_tokens_by_attribute(
-        self, message: Message, attribute: Text = TEXT_ATTRIBUTE
+        self, message: Message, attribute: Text = TEXT
     ) -> List[Text]:
         """Get processed text of attribute of a message"""
 
@@ -327,7 +325,7 @@ def _convert_attribute_tokens_to_texts(
 
         for attribute in attribute_tokens.keys():
             list_of_tokens = attribute_tokens[attribute]
-            if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
+            if attribute in [RESPONSE, TEXT]:
                 # vocabulary should not contain CLS token
                 list_of_tokens = [tokens[:-1] for tokens in list_of_tokens]
             attribute_texts[attribute] = [" ".join(tokens) for tokens in list_of_tokens]
@@ -357,7 +355,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
             combined_cleaned_texts += attribute_texts[attribute]
 
         try:
-            self.vectorizers[TEXT_ATTRIBUTE].fit(combined_cleaned_texts)
+            self.vectorizers[TEXT].fit(combined_cleaned_texts)
         except ValueError:
             logger.warning(
                 "Unable to train a shared CountVectorizer. "
@@ -418,13 +416,13 @@ def _create_sequence(
             # set input to list of tokens if sequence should be returned
             # otherwise join all tokens to a single string and pass that as a list
             tokens_without_cls = tokens
-            if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
+            if attribute in [TEXT, RESPONSE]:
                 tokens_without_cls = tokens[:-1]
 
             seq_vec = self.vectorizers[attribute].transform(tokens_without_cls)
             seq_vec.sort_indices()
 
-            if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
+            if attribute in [TEXT, RESPONSE]:
                 tokens_text = [" ".join(tokens_without_cls)]
                 cls_vec = self.vectorizers[attribute].transform(tokens_text)
                 cls_vec.sort_indices()
@@ -515,7 +513,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
             )
             return
 
-        attribute = TEXT_ATTRIBUTE
+        attribute = TEXT
         message_tokens = self._get_processed_message_tokens_by_attribute(
             message, attribute
         )
@@ -566,7 +564,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
                 if self.use_shared_vocab:
                     # Only persist vocabulary from one attribute. Can be loaded and
                     # distributed to all attributes.
-                    vocab = attribute_vocabularies[TEXT_ATTRIBUTE]
+                    vocab = attribute_vocabularies[TEXT]
                 else:
                     vocab = attribute_vocabularies
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 5e8e1b5cb917..4a003c6747fb 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -12,7 +12,7 @@
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES
+from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES
 from rasa.nlu.model import Metadata
 
 logger = logging.getLogger(__name__)
@@ -20,9 +20,9 @@
 
 class LexicalSyntacticFeaturizer(Featurizer):
 
-    provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
+    provides = [SPARSE_FEATURE_NAMES[TEXT]]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+    requires = [TOKENS_NAMES[TEXT]]
 
     defaults = {
         # 'features' is [before, word, after] array with before, word,
@@ -113,7 +113,7 @@ def _create_feature_to_idx_dict(
         all_features = []
         for example in training_data.training_examples:
             # [:-1] to remove CLS token
-            tokens = example.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+            tokens = example.get(TOKENS_NAMES[TEXT])[:-1]
             all_features.append(self._tokens_to_features(tokens))
 
         # build vocabulary of features
@@ -161,7 +161,7 @@ def _create_sparse_features(self, message: Message) -> None:
         features."""
 
         # [:-1] to remove CLS token
-        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1]
+        tokens = message.get(TOKENS_NAMES[TEXT])[:-1]
 
         sentence_features = self._tokens_to_features(tokens)
         one_hot_feature_vector = self._features_to_one_hot(sentence_features)
@@ -169,9 +169,9 @@ def _create_sparse_features(self, message: Message) -> None:
         sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)
 
         sparse_features = self._combine_with_existing_sparse_features(
-            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT]
         )
-        message.set(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], sparse_features)
+        message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features)
 
     def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
         """Convert words into discrete features."""
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index 4012a8bbc9e0..379931756dd0 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -14,9 +14,9 @@
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.constants import (
     CLS_TOKEN,
-    RESPONSE_ATTRIBUTE,
+    RESPONSE,
     SPARSE_FEATURE_NAMES,
-    TEXT_ATTRIBUTE,
+    TEXT,
     TOKENS_NAMES,
 )
 from rasa.nlu.featurizers.featurizer import Featurizer
@@ -29,9 +29,9 @@
 
 class RegexFeaturizer(Featurizer):
 
-    provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]]
+    provides = [SPARSE_FEATURE_NAMES[TEXT]]
 
-    requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]]
+    requires = [TOKENS_NAMES[TEXT]]
 
     def __init__(
         self,
@@ -57,11 +57,11 @@ def train(
         self._add_lookup_table_regexes(training_data.lookup_tables)
 
         for example in training_data.training_examples:
-            for attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
+            for attribute in [TEXT, RESPONSE]:
                 self._text_features_with_regex(example, attribute)
 
     def process(self, message: Message, **kwargs: Any) -> None:
-        self._text_features_with_regex(message, TEXT_ATTRIBUTE)
+        self._text_features_with_regex(message, TEXT)
 
     def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
         if self.known_patterns:
@@ -116,7 +116,7 @@ def _features_for_patterns(
                     if t.start < match.end() and t.end > match.start():
                         patterns[pattern["name"]] = True
                         vec[token_index][pattern_index] = 1.0
-                        if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
+                        if attribute in [RESPONSE, TEXT]:
                             # CLS token vector should contain all patterns
                             vec[-1][pattern_index] = 1.0
 
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index b99568a94a98..e54232d726c8 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -9,8 +9,8 @@
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_TEXT,
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
@@ -44,11 +44,11 @@
     BILOU_FLAG,
 )
 from rasa.nlu.constants import (
-    RESPONSE_ATTRIBUTE,
+    RESPONSE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
     DEFAULT_OPEN_UTTERANCE_TYPE,
     DENSE_FEATURE_NAMES,
-    TEXT_ATTRIBUTE,
+    TEXT,
     SPARSE_FEATURE_NAMES,
 )
 from rasa.utils.tensorflow.tf_model_data import RasaModelData
@@ -76,27 +76,20 @@ class ResponseSelector(DIETClassifier):
     and additional hidden layers are added together with dropout.
     """
 
-    provides = [RESPONSE_ATTRIBUTE, "response_ranking"]
+    provides = [RESPONSE, "response_ranking"]
 
     requires = [
-        any_of(
-            DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
-        ),
-        any_of(
-            DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE],
-            SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE],
-        ),
+        any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
+        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE],),
     ]
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # sizes of hidden layers before the embedding layer for input words
+        # sizes of hidden layers before the embedding layer
+        # for input words and responses
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_TEXT: [],
-        # sizes of hidden layers before the embedding layer for intent labels
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES_LABEL: [],
+        HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []},
         # Whether to share the hidden layer weights between input words and intent labels
         SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
@@ -121,7 +114,7 @@ class ResponseSelector(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {"text": 512, "label": 20},
+        DENSE_DIM: {TEXT: 512, LABEL: 512},
         # dimension size of embedding vectors
         EMBED_DIM: 20,
         # the type of the similarity
@@ -231,19 +224,15 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         if self.retrieval_intent:
             training_data = training_data.filter_by_intent(self.retrieval_intent)
 
-        label_id_dict = self._create_label_id_dict(
-            training_data, attribute=RESPONSE_ATTRIBUTE
-        )
+        label_id_dict = self._create_label_id_dict(training_data, attribute=RESPONSE)
         self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
 
         self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE
+            training_data, label_id_dict, attribute=RESPONSE
         )
 
         model_data = self._create_model_data(
-            training_data.intent_examples,
-            label_id_dict,
-            label_attribute=RESPONSE_ATTRIBUTE,
+            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE,
         )
 
         self.check_input_dimension_consistency(model_data)
@@ -288,6 +277,9 @@ def _update_metrics_to_log(self) -> None:
         self.metrics_to_log += ["r_loss", "r_acc"]
 
     def _prepare_layers(self) -> None:
+        self.text_name = TEXT
+        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
+
         self._prepare_sequence_layers(self.text_name)
         self._prepare_sequence_layers(self.label_name)
         if self.config[MASKED_LM]:
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index f842b87f854a..efe71721906b 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -25,7 +25,7 @@
     DEFAULT_OPEN_UTTERANCE_TYPE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
     OPEN_UTTERANCE_PREDICTION_KEY,
-    EXTRACTOR_ATTRIBUTE,
+    EXTRACTOR,
     PRETRAINED_EXTRACTORS,
 )
 from rasa.model import get_model
@@ -872,7 +872,7 @@ def align_entity_predictions(
         extractor: [] for extractor in extractors
     }
     for p in result.entity_predictions:
-        entities_by_extractors[p[EXTRACTOR_ATTRIBUTE]].append(p)
+        entities_by_extractors[p[EXTRACTOR]].append(p)
     extractor_labels: Dict[Text, List] = {extractor: [] for extractor in extractors}
     for t in result.tokens:
         true_token_labels.append(determine_token_labels(t, result.entity_targets, None))
diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py
index 30a45d0fd321..db976be5a2bd 100644
--- a/rasa/nlu/tokenizers/tokenizer.py
+++ b/rasa/nlu/tokenizers/tokenizer.py
@@ -7,12 +7,12 @@
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.nlu.components import Component
 from rasa.nlu.constants import (
-    RESPONSE_ATTRIBUTE,
-    TEXT_ATTRIBUTE,
+    RESPONSE,
+    TEXT,
     CLS_TOKEN,
     TOKENS_NAMES,
     MESSAGE_ATTRIBUTES,
-    INTENT_ATTRIBUTE,
+    INTENT,
 )
 
 logger = logging.getLogger(__name__)
@@ -90,7 +90,7 @@ def train(
         for example in training_data.training_examples:
             for attribute in MESSAGE_ATTRIBUTES:
                 if example.get(attribute) is not None:
-                    if attribute == INTENT_ATTRIBUTE:
+                    if attribute == INTENT:
                         tokens = self._split_intent(example)
                     else:
                         tokens = self.tokenize(example, attribute)
@@ -100,12 +100,12 @@ def train(
     def process(self, message: Message, **kwargs: Any) -> None:
         """Tokenize the incoming message."""
 
-        tokens = self.tokenize(message, TEXT_ATTRIBUTE)
-        tokens = self.add_cls_token(tokens, TEXT_ATTRIBUTE)
-        message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
+        tokens = self.tokenize(message, TEXT)
+        tokens = self.add_cls_token(tokens, TEXT)
+        message.set(TOKENS_NAMES[TEXT], tokens)
 
     def _split_intent(self, message: Message):
-        text = message.get(INTENT_ATTRIBUTE)
+        text = message.get(INTENT)
 
         words = (
             text.split(self.intent_split_symbol)
@@ -130,7 +130,7 @@ def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]:
 
     @staticmethod
     def add_cls_token(tokens: List[Token], attribute: Text) -> List[Token]:
-        if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE] and tokens:
+        if attribute in [RESPONSE, TEXT] and tokens:
             # +1 to have a space between the last token and the __cls__ token
             idx = tokens[-1].end + 1
             tokens.append(Token(CLS_TOKEN, idx))
diff --git a/rasa/nlu/training_data/formats/markdown.py b/rasa/nlu/training_data/formats/markdown.py
index 3b27facce2cb..95c630665421 100644
--- a/rasa/nlu/training_data/formats/markdown.py
+++ b/rasa/nlu/training_data/formats/markdown.py
@@ -11,7 +11,7 @@
     TrainingDataWriter,
 )
 from rasa.nlu.utils import build_entity
-from rasa.nlu.constants import INTENT_ATTRIBUTE
+from rasa.nlu.constants import INTENT
 
 
 if typing.TYPE_CHECKING:
@@ -218,7 +218,7 @@ def _generate_training_examples_md(self, training_data: "TrainingData") -> Text:
         # Sort by intent while keeping basic intent order
         for example in [e.as_dict_nlu() for e in training_data.training_examples]:
             rasa_nlu_training_data_utils.remove_untrainable_entities_from(example)
-            intent = example[INTENT_ATTRIBUTE]
+            intent = example[INTENT]
             training_examples.setdefault(intent, [])
             training_examples[intent].append(example)
 
diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py
index 7a661a748923..c161c1fa01ff 100644
--- a/rasa/nlu/training_data/message.py
+++ b/rasa/nlu/training_data/message.py
@@ -1,11 +1,11 @@
 from typing import Any, Optional, Tuple, Text
 
 from rasa.nlu.constants import (
-    ENTITIES_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
-    RESPONSE_ATTRIBUTE,
+    ENTITIES,
+    INTENT,
+    RESPONSE,
     RESPONSE_KEY_ATTRIBUTE,
-    TEXT_ATTRIBUTE,
+    TEXT,
     RESPONSE_IDENTIFIER_DELIMITER,
 )
 from rasa.nlu.utils import ordered
@@ -30,7 +30,7 @@ def set(self, prop, info, add_to_output=False) -> None:
             self.output_properties.add(prop)
 
     def get(self, prop, default=None) -> Any:
-        if prop == TEXT_ATTRIBUTE:
+        if prop == TEXT:
             return self.text
         return self.data.get(prop, default)
 
@@ -38,10 +38,10 @@ def as_dict_nlu(self) -> dict:
         """Get dict representation of message as it would appear in training data"""
 
         d = self.as_dict()
-        if d.get(INTENT_ATTRIBUTE, None):
-            d[INTENT_ATTRIBUTE] = self.get_combined_intent_response_key()
+        if d.get(INTENT, None):
+            d[INTENT] = self.get_combined_intent_response_key()
         d.pop(RESPONSE_KEY_ATTRIBUTE, None)
-        d.pop(RESPONSE_ATTRIBUTE, None)
+        d.pop(RESPONSE, None)
         return d
 
     def as_dict(self, only_output_properties=False) -> dict:
@@ -73,17 +73,17 @@ def build(cls, text, intent=None, entities=None) -> "Message":
         data = {}
         if intent:
             split_intent, response_key = cls.separate_intent_response_key(intent)
-            data[INTENT_ATTRIBUTE] = split_intent
+            data[INTENT] = split_intent
             if response_key:
                 data[RESPONSE_KEY_ATTRIBUTE] = response_key
         if entities:
-            data[ENTITIES_ATTRIBUTE] = entities
+            data[ENTITIES] = entities
         return cls(text, data)
 
     def get_combined_intent_response_key(self) -> Text:
         """Get intent as it appears in training data"""
 
-        intent = self.get(INTENT_ATTRIBUTE)
+        intent = self.get(INTENT)
         response_key = self.get(RESPONSE_KEY_ATTRIBUTE)
         response_key_suffix = (
             f"{RESPONSE_IDENTIFIER_DELIMITER}{response_key}" if response_key else ""
diff --git a/rasa/nlu/training_data/training_data.py b/rasa/nlu/training_data/training_data.py
index e542734ea1f0..e3e97d126820 100644
--- a/rasa/nlu/training_data/training_data.py
+++ b/rasa/nlu/training_data/training_data.py
@@ -8,7 +8,7 @@
 
 import rasa.nlu.utils
 from rasa.utils.common import raise_warning, lazy_property
-from rasa.nlu.constants import RESPONSE_ATTRIBUTE, RESPONSE_KEY_ATTRIBUTE
+from rasa.nlu.constants import RESPONSE, RESPONSE_KEY_ATTRIBUTE
 from rasa.nlu.training_data.message import Message
 from rasa.nlu.training_data.util import check_duplicate_synonym
 from rasa.nlu.utils import list_to_str
@@ -186,7 +186,7 @@ def fill_response_phrases(self) -> None:
                 assistant_utterances = self.nlg_stories.get(story_lookup_intent, [])
                 if assistant_utterances:
                     # selecting only first assistant utterance for now
-                    example.set(RESPONSE_ATTRIBUTE, assistant_utterances[0])
+                    example.set(RESPONSE, assistant_utterances[0])
                 else:
                     raise ValueError(
                         "No response phrases found for {}. Check training data "
@@ -384,10 +384,8 @@ def build_nlg_stories_from_examples(examples) -> Dict[Text, list]:
 
         nlg_stories = {}
         for ex in examples:
-            if ex.get(RESPONSE_KEY_ATTRIBUTE) and ex.get(RESPONSE_ATTRIBUTE):
-                nlg_stories[ex.get_combined_intent_response_key()] = [
-                    ex.get(RESPONSE_ATTRIBUTE)
-                ]
+            if ex.get(RESPONSE_KEY_ATTRIBUTE) and ex.get(RESPONSE):
+                nlg_stories[ex.get_combined_intent_response_key()] = [ex.get(RESPONSE)]
         return nlg_stories
 
     def split_nlu_examples(
diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py
index 1294434518c7..bb189a3bb4a2 100644
--- a/rasa/nlu/training_data/util.py
+++ b/rasa/nlu/training_data/util.py
@@ -5,8 +5,8 @@
 
 import rasa.utils.io as io_utils
 from rasa.nlu.constants import (
-    ENTITIES_ATTRIBUTE,
-    EXTRACTOR_ATTRIBUTE,
+    ENTITIES,
+    EXTRACTOR,
     PRETRAINED_EXTRACTORS,
 )
 from rasa.utils.common import raise_warning
@@ -67,7 +67,7 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None:
         example: Serialised training example to inspect.
     """
 
-    example_entities = example.get(ENTITIES_ATTRIBUTE)
+    example_entities = example.get(ENTITIES)
 
     if not example_entities:
         # example contains no entities, so there's nothing to do
@@ -76,7 +76,7 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None:
     trainable_entities = []
 
     for entity in example_entities:
-        if entity.get(EXTRACTOR_ATTRIBUTE) in PRETRAINED_EXTRACTORS:
+        if entity.get(EXTRACTOR) in PRETRAINED_EXTRACTORS:
             logger.debug(
                 f"Excluding entity '{json.dumps(entity)}' from training data. "
                 f"Entity examples extracted by the following classes are not "
@@ -86,4 +86,4 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None:
         else:
             trainable_entities.append(entity)
 
-    example[ENTITIES_ATTRIBUTE] = trainable_entities
+    example[ENTITIES] = trainable_entities
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index c6f5e87d00b4..73efd6faaecb 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -4,10 +4,10 @@
 from rasa.nlu.training_data import Message
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.constants import (
-    ENTITIES_ATTRIBUTE,
+    ENTITIES,
     TOKENS_NAMES,
-    TEXT_ATTRIBUTE,
-    BILOU_ENTITIES_ATTRIBUTE,
+    TEXT,
+    BILOU_ENTITIES,
 )
 
 BILOU_PREFIXES = ["B-", "I-", "U-", "L-"]
@@ -22,13 +22,13 @@ def entity_name_from_tag(tag: Text) -> Text:
 
 def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
     """Maps the entity tags of the message to the ids of the provided dict."""
-    if message.get(BILOU_ENTITIES_ATTRIBUTE):
+    if message.get(BILOU_ENTITIES):
         _tags = [
             tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict["O"]
-            for _tag in message.get(BILOU_ENTITIES_ATTRIBUTE)
+            for _tag in message.get(BILOU_ENTITIES)
         ]
     else:
-        _tags = [tag_id_dict["O"] for _ in message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])]
+        _tags = [tag_id_dict["O"] for _ in message.get(TOKENS_NAMES[TEXT])]
 
     return _tags
 
@@ -44,8 +44,8 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
         [
             entity_name_from_tag(e)
             for example in training_data.training_examples
-            if example.get(BILOU_ENTITIES_ATTRIBUTE)
-            for e in example.get(BILOU_ENTITIES_ATTRIBUTE)
+            if example.get(BILOU_ENTITIES)
+            for e in example.get(BILOU_ENTITIES)
         ]
     ) - {"O"}
 
@@ -63,17 +63,15 @@ def apply_bilou_schema(training_data: TrainingData):
     """Obtains a list of BILOU entity tags and sets them on the corresponding
     message."""
     for message in training_data.training_examples:
-        entities = message.get(ENTITIES_ATTRIBUTE)
+        entities = message.get(ENTITIES)
 
         if not entities:
             continue
 
         entities = _map_message_entities(message)
-        output = _bilou_tags_from_offsets(
-            message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), entities
-        )
+        output = _bilou_tags_from_offsets(message.get(TOKENS_NAMES[TEXT]), entities)
 
-        message.set(BILOU_ENTITIES_ATTRIBUTE, output)
+        message.set(BILOU_ENTITIES, output)
 
 
 def _map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
@@ -82,7 +80,7 @@ def _map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
     def convert_entity(entity):
         return entity["start"], entity["end"], entity["entity"]
 
-    return [convert_entity(entity) for entity in message.get(ENTITIES_ATTRIBUTE, [])]
+    return [convert_entity(entity) for entity in message.get(ENTITIES, [])]
 
 
 def _bilou_tags_from_offsets(
diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py
index f06ff5f7459f..25fd7e0e9f24 100644
--- a/rasa/nlu/utils/spacy_utils.py
+++ b/rasa/nlu/utils/spacy_utils.py
@@ -14,7 +14,7 @@
     from spacy.tokens.doc import Doc  # pytype: disable=import-error
     from rasa.nlu.model import Metadata
 
-from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES
+from rasa.nlu.constants import TEXT, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES
 
 
 class SpacyNLP(Component):
@@ -229,7 +229,7 @@ def train(
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        message.set(SPACY_DOCS[TEXT_ATTRIBUTE], self.doc_for_text(message.text))
+        message.set(SPACY_DOCS[TEXT], self.doc_for_text(message.text))
 
     @classmethod
     def load(
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index bbc959acca92..88a2ad7e4328 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -1,8 +1,7 @@
 # constants - configuration parameters
 
-HIDDEN_LAYERS_SIZES_TEXT = "hidden_layers_sizes_text"
-HIDDEN_LAYERS_SIZES_LABEL = "hidden_layers_sizes_label"
-HIDDEN_LAYERS_SIZES_DIALOGUE = "hidden_layers_sizes_dialogue"
+LABEL = "label"
+HIDDEN_LAYERS_SIZES = "hidden_layers_sizes"
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
 
 TRANSFORMER_SIZE = "transformer_size"
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 2ce1bd0ef39e..5d97cf34cef7 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,11 +1,13 @@
 import logging
 import tensorflow as tf
 import numpy as np
-from typing import Optional, Text, Dict, Any
+from typing import Optional, Text, Dict, Any, Union, List
+from rasa.core.constants import DIALOGUE
+from rasa.nlu.constants import TEXT
 
 from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES_TEXT,
-    HIDDEN_LAYERS_SIZES_LABEL,
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
     MAX_SEQ_LENGTH,
@@ -20,7 +22,6 @@
     MU_NEG,
     MU_POS,
     EMBED_DIM,
-    HIDDEN_LAYERS_SIZES_DIALOGUE,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
 )
@@ -65,14 +66,24 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
 
 
 def _replace_deprecated_option(
-    old_option: Text, new_option: Text, config: Dict[Text, Any]
+    old_option: Text, new_option: Union[Text, List[Text]], config: Dict[Text, Any]
 ) -> Dict[Text, Any]:
     if old_option in config:
-        logger.warning(
-            f"Option '{old_option}' got renamed to '{new_option}'. "
-            f"Please update your configuration file."
-        )
-        config[new_option] = config[old_option]
+        if isinstance(new_option, str):
+            logger.warning(
+                f"Option '{old_option}' got renamed to '{new_option}'. "
+                f"Please update your configuration file."
+            )
+            config[new_option] = config[old_option]
+        else:
+            logger.warning(
+                f"Option '{old_option}' got renamed to "
+                f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. "
+                f"Please update your configuration file."
+            )
+            option_dict = config.get(new_option[0], {})
+            option_dict[new_option[1]] = config[old_option]
+            config[new_option[0]] = option_dict
 
     return config
 
@@ -80,18 +91,18 @@ def _replace_deprecated_option(
 def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
 
     config = _replace_deprecated_option(
-        "hidden_layers_sizes_pre_dial", HIDDEN_LAYERS_SIZES_DIALOGUE, config
+        "hidden_layers_sizes_pre_dial", [HIDDEN_LAYERS_SIZES, DIALOGUE], config
     )
     config = _replace_deprecated_option(
-        "hidden_layers_sizes_bot", HIDDEN_LAYERS_SIZES_LABEL, config
+        "hidden_layers_sizes_bot", [HIDDEN_LAYERS_SIZES, LABEL], config
     )
     config = _replace_deprecated_option("droprate_a", DROPRATE_DIALOGUE, config)
     config = _replace_deprecated_option("droprate_b", DROPRATE_LABEL, config)
     config = _replace_deprecated_option(
-        "hidden_layers_sizes_a", HIDDEN_LAYERS_SIZES_TEXT, config
+        "hidden_layers_sizes_a", [HIDDEN_LAYERS_SIZES, TEXT], config
     )
     config = _replace_deprecated_option(
-        "hidden_layers_sizes_b", HIDDEN_LAYERS_SIZES_LABEL, config
+        "hidden_layers_sizes_b", [HIDDEN_LAYERS_SIZES, LABEL], config
     )
     config = _replace_deprecated_option(
         "num_transformer_layers", NUM_TRANSFORMER_LAYERS, config
diff --git a/tests/nlu/base/test_training_data.py b/tests/nlu/base/test_training_data.py
index 9db3f2d56f8d..42ed6e39795b 100644
--- a/tests/nlu/base/test_training_data.py
+++ b/tests/nlu/base/test_training_data.py
@@ -5,7 +5,7 @@
 import tempfile
 from jsonschema import ValidationError
 
-from rasa.nlu.constants import TEXT_ATTRIBUTE
+from rasa.nlu.constants import TEXT
 from rasa.nlu import training_data
 from rasa.nlu.convert import convert_training_data
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
@@ -275,7 +275,7 @@ def test_repeated_entities():
         example = td.entity_examples[0]
         entities = example.get("entities")
         assert len(entities) == 1
-        tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT_ATTRIBUTE)
+        tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
         start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens)
         assert start == 9
         assert end == 10
@@ -309,7 +309,7 @@ def test_multiword_entities():
         example = td.entity_examples[0]
         entities = example.get("entities")
         assert len(entities) == 1
-        tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT_ATTRIBUTE)
+        tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
         start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens)
         assert start == 4
         assert end == 7
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index 19633b5b9ab4..c555a28f1215 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -7,10 +7,10 @@
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     SPARSE_FEATURE_NAMES,
     DENSE_FEATURE_NAMES,
-    INTENT_ATTRIBUTE,
+    INTENT,
 )
 from rasa.utils.tensorflow.constants import LOSS_TYPE, RANDOM_SEED, RANKING_LENGTH
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
@@ -47,15 +47,15 @@ def test_compute_default_label_features():
                 Message(
                     "test a",
                     data={
-                        SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1),
-                        DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1),
+                        SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1),
+                        DENSE_FEATURE_NAMES[TEXT]: np.zeros(1),
                     },
                 ),
                 Message(
                     "test b",
                     data={
-                        SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1),
-                        DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1),
+                        SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1),
+                        DENSE_FEATURE_NAMES[TEXT]: np.zeros(1),
                     },
                 ),
             ],
@@ -66,8 +66,8 @@ def test_compute_default_label_features():
                 Message(
                     "test a",
                     data={
-                        SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1),
-                        DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1),
+                        SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1),
+                        DENSE_FEATURE_NAMES[INTENT]: np.zeros(1),
                     },
                 )
             ],
@@ -76,7 +76,7 @@ def test_compute_default_label_features():
     ],
 )
 def test_check_labels_features_exist(messages, expected):
-    attribute = TEXT_ATTRIBUTE
+    attribute = TEXT
 
     assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected
 
diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index be956e6c5c17..98a74b6088e1 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -1,4 +1,4 @@
-from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
@@ -49,12 +49,12 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
     extractor.train(training_data)
 
     sentence = "italian restaurant"
-    message = Message(sentence, {SPACY_DOCS[TEXT_ATTRIBUTE]: spacy_nlp(sentence)})
+    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})
 
     tokenizer.process(message)
     extractor.process(message)
 
-    detected_entities = message.get(ENTITIES_ATTRIBUTE)
+    detected_entities = message.get(ENTITIES)
 
     assert len(detected_entities) == 1
     assert detected_entities[0]["entity"] == "cuisine"
diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py
index a16bf6597156..c9dc0f47b3ba 100644
--- a/tests/nlu/featurizers/test_convert_featurizer.py
+++ b/tests/nlu/featurizers/test_convert_featurizer.py
@@ -5,11 +5,11 @@
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
-    RESPONSE_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
+    RESPONSE,
+    INTENT,
 )
 from rasa.nlu.training_data import Message
 from rasa.nlu.config import RasaNLUModelConfig
@@ -21,9 +21,9 @@ def test_convert_featurizer_process():
 
     sentence = "Hey how are you today ?"
     message = Message(sentence)
-    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
-    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
-    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
+    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
+    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
+    message.set(TOKENS_NAMES[TEXT], tokens)
 
     featurizer.process(message)
 
@@ -32,7 +32,7 @@ def test_convert_featurizer_process():
         [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
     )
 
-    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
 
     assert len(tokens) == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
@@ -44,11 +44,11 @@ def test_convert_featurizer_train():
 
     sentence = "Hey how are you today ?"
     message = Message(sentence)
-    message.set(RESPONSE_ATTRIBUTE, sentence)
-    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
-    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
-    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
-    message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens)
+    message.set(RESPONSE, sentence)
+    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
+    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
+    message.set(TOKENS_NAMES[TEXT], tokens)
+    message.set(TOKENS_NAMES[RESPONSE], tokens)
 
     featurizer.train(TrainingData([message]), RasaNLUModelConfig())
 
@@ -57,19 +57,19 @@ def test_convert_featurizer_train():
         [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
     )
 
-    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
 
     assert len(tokens) == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
 
     assert len(tokens) == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])
 
     assert vecs is None
 
@@ -85,7 +85,7 @@ def test_convert_featurizer_train():
     ],
 )
 def test_convert_featurizer_tokens_to_text(sentence, expected_text):
-    tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT_ATTRIBUTE)
+    tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT)
 
     actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]
 
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py
index 581eefc77648..d4a8c99113e2 100644
--- a/tests/nlu/featurizers/test_count_vectors_featurizer.py
+++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py
@@ -7,10 +7,10 @@
 from rasa.nlu.constants import (
     CLS_TOKEN,
     TOKENS_NAMES,
-    TEXT_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
+    TEXT,
+    INTENT,
     SPARSE_FEATURE_NAMES,
-    RESPONSE_ATTRIBUTE,
+    RESPONSE,
 )
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message
@@ -43,10 +43,10 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
     ftr.process(test_message)
 
     assert isinstance(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
     )
 
-    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
 
     assert np.all(actual[0] == expected)
     assert np.all(actual[-1] == expected_cls)
@@ -68,8 +68,8 @@ def test_count_vector_featurizer_attribute_featurization(
 
     train_message = Message(sentence)
     # this is needed for a valid training example
-    train_message.set(INTENT_ATTRIBUTE, intent)
-    train_message.set(RESPONSE_ATTRIBUTE, response)
+    train_message.set(INTENT, intent)
+    train_message.set(RESPONSE, response)
 
     data = TrainingData([train_message])
 
@@ -78,19 +78,19 @@ def test_count_vector_featurizer_attribute_featurization(
 
     if intent_features:
         assert (
-            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
+            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
             == intent_features
         )
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None
+        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None
 
     if response_features:
         assert (
-            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
+            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
             == response_features
         )
     else:
-        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None
+        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
 
 
 @pytest.mark.parametrize(
@@ -117,23 +117,21 @@ def test_count_vector_featurizer_shared_vocab(
 
     train_message = Message(sentence)
     # this is needed for a valid training example
-    train_message.set(INTENT_ATTRIBUTE, intent)
-    train_message.set(RESPONSE_ATTRIBUTE, response)
+    train_message.set(INTENT, intent)
+    train_message.set(RESPONSE, response)
 
     data = TrainingData([train_message])
     tk.train(data)
     ftr.train(data)
 
     assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0]
-        == text_features
+        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features
     )
     assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
-        == intent_features
+        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features
     )
     assert np.all(
-        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
+        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
         == response_features
     )
 
@@ -160,9 +158,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
-    )
+    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
 
 
 @pytest.mark.parametrize(
@@ -192,9 +188,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected):
     test_message = Message(sentence)
     ftr.process(test_message)
 
-    assert np.all(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
-    )
+    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
 
 
 @pytest.mark.parametrize(
@@ -220,20 +214,18 @@ def test_count_vector_featurizer_using_tokens(tokens, expected):
     tokens_feature = [Token(i, 0) for i in tokens]
 
     train_message = Message("")
-    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)
+    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)
 
     data = TrainingData([train_message])
 
     ftr.train(data)
 
     test_message = Message("")
-    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)
+    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)
 
     ftr.process(test_message)
 
-    assert np.all(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
-    )
+    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
 
 
 @pytest.mark.parametrize(
@@ -257,9 +249,7 @@ def test_count_vector_featurizer_char(sentence, expected):
     WhitespaceTokenizer().process(test_message)
     ftr.process(test_message)
 
-    assert np.all(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
-    )
+    assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
 
 
 def test_count_vector_featurizer_persist_load(tmpdir):
@@ -320,10 +310,10 @@ def test_count_vector_featurizer_persist_load(tmpdir):
     # check that train features and test features after loading are the same
     assert np.all(
         [
-            train_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
-            == test_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(),
-            train_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
-            == test_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(),
+            train_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
+            == test_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
+            train_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
+            == test_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(),
         ]
     )
 
@@ -334,8 +324,8 @@ def test_count_vectors_featurizer_train():
 
     sentence = "Hey how are you today ?"
     message = Message(sentence)
-    message.set(RESPONSE_ATTRIBUTE, sentence)
-    message.set(INTENT_ATTRIBUTE, "intent")
+    message.set(RESPONSE, sentence)
+    message.set(INTENT, "intent")
     WhitespaceTokenizer().train(TrainingData([message]))
 
     featurizer.train(TrainingData([message]), RasaNLUModelConfig())
@@ -343,19 +333,19 @@ def test_count_vectors_featurizer_train():
     expected = np.array([0, 1, 0, 0, 0])
     expected_cls = np.array([1, 1, 1, 1, 1])
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])
 
     assert (6, 5) == vecs.shape
     assert np.all(vecs.toarray()[0] == expected)
     assert np.all(vecs.toarray()[-1] == expected_cls)
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])
 
     assert (6, 5) == vecs.shape
     assert np.all(vecs.toarray()[0] == expected)
     assert np.all(vecs.toarray()[-1] == expected_cls)
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])
 
     assert (1, 1) == vecs.shape
     assert np.all(vecs.toarray()[0] == np.array([1]))
diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py
index ee6af4f197f6..61769ef24f5e 100644
--- a/tests/nlu/featurizers/test_featurizer.py
+++ b/tests/nlu/featurizers/test_featurizer.py
@@ -3,14 +3,14 @@
 import scipy.sparse
 
 from rasa.nlu.featurizers.featurizer import Featurizer, sequence_to_sentence_features
-from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT_ATTRIBUTE
+from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT
 from rasa.nlu.training_data import Message
 
 
 def test_combine_with_existing_dense_features():
 
     featurizer = Featurizer({"return_sequence": False})
-    attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+    attribute = DENSE_FEATURE_NAMES[TEXT]
 
     existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]]
     new_features = [[1, 0], [0, 1]]
@@ -28,7 +28,7 @@ def test_combine_with_existing_dense_features():
 
 def test_combine_with_existing_dense_features_shape_mismatch():
     featurizer = Featurizer({"return_sequence": False})
-    attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+    attribute = DENSE_FEATURE_NAMES[TEXT]
 
     existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]]
     new_features = [[0, 1]]
@@ -45,7 +45,7 @@ def test_combine_with_existing_dense_features_shape_mismatch():
 def test_combine_with_existing_sparse_features():
 
     featurizer = Featurizer({"return_sequence": False})
-    attribute = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+    attribute = SPARSE_FEATURE_NAMES[TEXT]
 
     existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
     new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]])
@@ -65,7 +65,7 @@ def test_combine_with_existing_sparse_features():
 def test_combine_with_existing_sparse_features_shape_mismatch():
 
     featurizer = Featurizer({"return_sequence": False})
-    attribute = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
+    attribute = SPARSE_FEATURE_NAMES[TEXT]
 
     existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
     new_features = scipy.sparse.csr_matrix([[0, 1]])
diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
index acd89825e61a..675b14bbda63 100644
--- a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
+++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py
@@ -9,7 +9,7 @@
     LexicalSyntacticFeaturizer,
 )
 from rasa.nlu.training_data import TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, SPARSE_FEATURE_NAMES, SPACY_DOCS
+from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, SPACY_DOCS
 from rasa.nlu.training_data import Message
 
 
@@ -57,10 +57,10 @@ def test_text_featurizer(sentence, expected_features):
     featurizer.process(test_message)
 
     assert isinstance(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
     )
 
-    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
 
     assert np.all(actual == expected_features)
 
@@ -91,10 +91,10 @@ def test_text_featurizer_window_size(sentence, expected, expected_cls):
     featurizer.process(test_message)
 
     assert isinstance(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
     )
 
-    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
 
     assert np.all(actual[0] == expected)
     assert np.all(actual[-1] == expected_cls)
@@ -121,8 +121,8 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
     train_message = Message(sentence)
     test_message = Message(sentence)
 
-    train_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
-    test_message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
+    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
 
     SpacyTokenizer().process(train_message)
     SpacyTokenizer().process(test_message)
@@ -132,9 +132,9 @@ def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
     featurizer.process(test_message)
 
     assert isinstance(
-        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix
+        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
     )
 
-    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
+    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
 
     assert np.all(actual == expected)
diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py
index 95fb1f7a83b0..6a2f1757a2f5 100644
--- a/tests/nlu/featurizers/test_mitie_featurizer.py
+++ b/tests/nlu/featurizers/test_mitie_featurizer.py
@@ -2,9 +2,9 @@
 
 from rasa.nlu.constants import (
     DENSE_FEATURE_NAMES,
-    TEXT_ATTRIBUTE,
-    RESPONSE_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
+    TEXT,
+    RESPONSE,
+    INTENT,
     TOKENS_NAMES,
 )
 from rasa.nlu.training_data import Message, TrainingData
@@ -20,7 +20,7 @@ def test_mitie_featurizer(mitie_feature_extractor):
     sentence = "Hey how are you today"
     message = Message(sentence)
     MitieTokenizer().process(message)
-    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
+    tokens = message.get(TOKENS_NAMES[TEXT])
 
     vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
 
@@ -40,8 +40,8 @@ def test_mitie_featurizer_train(mitie_feature_extractor):
 
     sentence = "Hey how are you today"
     message = Message(sentence)
-    message.set(RESPONSE_ATTRIBUTE, sentence)
-    message.set(INTENT_ATTRIBUTE, "intent")
+    message.set(RESPONSE, sentence)
+    message.set(INTENT, "intent")
     MitieTokenizer().train(TrainingData([message]))
 
     featurizer.train(
@@ -55,18 +55,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor):
     )
     expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])
 
-    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
 
-    assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])) == len(vecs)
+    assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
 
-    assert len(message.get(TOKENS_NAMES[RESPONSE_ATTRIBUTE])) == len(vecs)
+    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])
 
     assert vecs is None
diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py
index dcc9b80e107d..d695174339cf 100644
--- a/tests/nlu/featurizers/test_regex_featurizer.py
+++ b/tests/nlu/featurizers/test_regex_featurizer.py
@@ -6,11 +6,11 @@
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
-    RESPONSE_ATTRIBUTE,
+    TEXT,
+    RESPONSE,
     SPACY_DOCS,
     TOKENS_NAMES,
-    INTENT_ATTRIBUTE,
+    INTENT,
     SPARSE_FEATURE_NAMES,
 )
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -80,17 +80,17 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
 
     # adds tokens to the message
     tokenizer = SpacyTokenizer({})
-    message = Message(sentence, data={RESPONSE_ATTRIBUTE: sentence})
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+    message = Message(sentence, data={RESPONSE: sentence})
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
     tokenizer.process(message)
 
-    result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE)
+    result = ftr._features_for_patterns(message, TEXT)
     assert np.allclose(result.toarray(), expected, atol=1e-10)
 
     # the tokenizer should have added tokens
-    assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])) > 0
+    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
     # the number of regex matches on each token should match
-    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])):
+    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
         token_matches = token.get("pattern").values()
         num_matches = sum(token_matches)
         assert num_matches == labeled_tokens.count(i)
@@ -144,13 +144,13 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
     message.set("spacy_doc", spacy_nlp(sentence))
     tokenizer.process(message)
 
-    result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE)
+    result = ftr._features_for_patterns(message, TEXT)
     assert np.allclose(result.toarray(), expected, atol=1e-10)
 
     # the tokenizer should have added tokens
-    assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])) > 0
+    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
     # the number of regex matches on each token should match
-    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])):
+    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
         token_matches = token.get("pattern").values()
         num_matches = sum(token_matches)
         assert num_matches == labeled_tokens.count(i)
@@ -177,10 +177,10 @@ def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nl
     # adds tokens to the message
     tokenizer = SpacyTokenizer()
     message = Message(sentence)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
     tokenizer.process(message)
 
-    result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE)
+    result = ftr._features_for_patterns(message, TEXT)
     assert np.allclose(result.toarray()[0], expected, atol=1e-10)
     assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)
 
@@ -197,8 +197,8 @@ def test_regex_featurizer_train():
 
     sentence = "hey how are you today 19.12.2019 ?"
     message = Message(sentence)
-    message.set(RESPONSE_ATTRIBUTE, sentence)
-    message.set(INTENT_ATTRIBUTE, "intent")
+    message.set(RESPONSE, sentence)
+    message.set(INTENT, "intent")
     WhitespaceTokenizer().train(TrainingData([message]))
 
     featurizer.train(
@@ -208,18 +208,18 @@ def test_regex_featurizer_train():
     expected = np.array([0, 1, 0])
     expected_cls = np.array([1, 1, 1])
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])
 
     assert (7, 3) == vecs.shape
     assert np.all(vecs.toarray()[0] == expected)
     assert np.all(vecs.toarray()[-1] == expected_cls)
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])
 
     assert (7, 3) == vecs.shape
     assert np.all(vecs.toarray()[0] == expected)
     assert np.all(vecs.toarray()[-1] == expected_cls)
 
-    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])
 
     assert vecs is None
diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py
index e13acd4a0312..9bbce2559a48 100644
--- a/tests/nlu/featurizers/test_spacy_featurizer.py
+++ b/tests/nlu/featurizers/test_spacy_featurizer.py
@@ -8,10 +8,10 @@
 from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
 from rasa.nlu.constants import (
     SPACY_DOCS,
-    TEXT_ATTRIBUTE,
+    TEXT,
     DENSE_FEATURE_NAMES,
-    RESPONSE_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
+    RESPONSE,
+    INTENT,
 )
 
 
@@ -20,11 +20,11 @@ def test_spacy_featurizer_cls_vector(spacy_nlp):
 
     sentence = "Hey how are you today"
     message = Message(sentence)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
 
     featurizer._set_spacy_features(message)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
 
     expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
     expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])
@@ -146,28 +146,28 @@ def test_spacy_featurizer_train(spacy_nlp):
 
     sentence = "Hey how are you today"
     message = Message(sentence)
-    message.set(RESPONSE_ATTRIBUTE, sentence)
-    message.set(INTENT_ATTRIBUTE, "intent")
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
-    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))
+    message.set(RESPONSE, sentence)
+    message.set(INTENT, "intent")
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
+    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(sentence))
 
     featurizer.train(TrainingData([message]), RasaNLUModelConfig())
 
     expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
     expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])
 
-    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
 
     assert 6 == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
 
     assert 6 == len(vecs)
     assert np.allclose(vecs[0][:5], expected, atol=1e-5)
     assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
 
-    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])
 
     assert vecs is None
diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py
index 30c052c3e781..06cd3a9e7bad 100644
--- a/tests/nlu/tokenizers/test_convert_tokenizer.py
+++ b/tests/nlu/tokenizers/test_convert_tokenizer.py
@@ -1,7 +1,7 @@
 import pytest
 
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES
+from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 
 
@@ -27,7 +27,7 @@
 def test_convert_tokenizer_edge_cases(text, expected_tokens, expected_indices):
     tk = ConveRTTokenizer()
 
-    tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(Message(text), attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -47,10 +47,8 @@ def test_custom_intent_symbol(text, expected_tokens):
     tk = ConveRTTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     tk.train(TrainingData([message]))
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py
index 428e4b7cd350..426215541587 100644
--- a/tests/nlu/tokenizers/test_jieba_tokenizer.py
+++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py
@@ -5,7 +5,7 @@
 import pytest
 
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES
+from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES
 
 
 @pytest.mark.parametrize(
@@ -26,7 +26,7 @@
 def test_jieba(text, expected_tokens, expected_indices):
     tk = JiebaTokenizer()
 
-    tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(Message(text), attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -42,7 +42,7 @@ def test_jieba_load_dictionary(tmpdir_factory):
         JiebaTokenizer, "load_custom_dictionary", return_value=None
     ) as mock_method:
         tk = JiebaTokenizer(component_config)
-        tk.tokenize(Message(""), attribute=TEXT_ATTRIBUTE)
+        tk.tokenize(Message(""), attribute=TEXT)
 
     mock_method.assert_called_once_with(dictionary_path)
 
@@ -60,10 +60,8 @@ def test_custom_intent_symbol(text, expected_tokens):
     tk = JiebaTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     tk.train(TrainingData([message]))
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
diff --git a/tests/nlu/tokenizers/test_mitie_tokenizer.py b/tests/nlu/tokenizers/test_mitie_tokenizer.py
index 647290990b48..ebf40b0f9415 100644
--- a/tests/nlu/tokenizers/test_mitie_tokenizer.py
+++ b/tests/nlu/tokenizers/test_mitie_tokenizer.py
@@ -1,7 +1,7 @@
 import pytest
 
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES
+from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 
 
@@ -23,7 +23,7 @@
 def test_mitie(text, expected_tokens, expected_indices):
     tk = MitieTokenizer()
 
-    tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(Message(text), attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -43,10 +43,8 @@ def test_custom_intent_symbol(text, expected_tokens):
     tk = MitieTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     tk.train(TrainingData([message]))
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py
index 1a7c689798a1..0a3f7ecf42f7 100644
--- a/tests/nlu/tokenizers/test_spacy_tokenizer.py
+++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py
@@ -4,10 +4,10 @@
 from rasa.nlu.training_data import Message
 from rasa.nlu.constants import (
     CLS_TOKEN,
-    TEXT_ATTRIBUTE,
+    TEXT,
     SPACY_DOCS,
-    INTENT_ATTRIBUTE,
-    RESPONSE_ATTRIBUTE,
+    INTENT,
+    RESPONSE,
     TOKENS_NAMES,
 )
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -32,9 +32,9 @@ def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
     tk = SpacyTokenizer()
 
     message = Message(text)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
 
-    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(message, attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -52,9 +52,9 @@ def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
     tk = SpacyTokenizer()
 
     message = Message(text)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
 
-    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(message, attribute=TEXT)
 
     assert [t.data.get("pos") for t in tokens] == expected_pos_tags
 
@@ -73,16 +73,16 @@ def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
     tk = SpacyTokenizer()
 
     message = Message(text)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
-    message.set(RESPONSE_ATTRIBUTE, text)
-    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(text))
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
+    message.set(RESPONSE, text)
+    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))
 
     training_data = TrainingData()
     training_data.training_examples = [message]
 
     tk.train(training_data)
 
-    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
+    for attribute in [RESPONSE, TEXT]:
         tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])
 
         assert [t.text for t in tokens] == expected_tokens
@@ -103,11 +103,9 @@ def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
     tk = SpacyTokenizer(component_config)
 
     message = Message(text)
-    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
+    message.set(INTENT, text)
 
     tk.train(TrainingData([message]))
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py
index b69bc6af7adf..c526358466d1 100644
--- a/tests/nlu/tokenizers/test_tokenizer.py
+++ b/tests/nlu/tokenizers/test_tokenizer.py
@@ -2,9 +2,9 @@
 
 from rasa.nlu.constants import (
     CLS_TOKEN,
-    TEXT_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
-    RESPONSE_ATTRIBUTE,
+    TEXT,
+    INTENT,
+    RESPONSE,
     TOKENS_NAMES,
 )
 from rasa.nlu.training_data import Message, TrainingData
@@ -40,15 +40,15 @@ def test_train_tokenizer(text, expected_tokens, expected_indices):
     tk = WhitespaceTokenizer()
 
     message = Message(text)
-    message.set(RESPONSE_ATTRIBUTE, text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(RESPONSE, text)
+    message.set(INTENT, text)
 
     training_data = TrainingData()
     training_data.training_examples = [message]
 
     tk.train(training_data)
 
-    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
+    for attribute in [RESPONSE, TEXT]:
         tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])
 
         assert [t.text for t in tokens] == expected_tokens
@@ -56,7 +56,7 @@ def test_train_tokenizer(text, expected_tokens, expected_indices):
         assert [t.end for t in tokens] == [i[1] for i in expected_indices]
 
     # check intent attribute
-    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT_ATTRIBUTE])
+    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])
 
     assert [t.text for t in tokens] == [text]
 
@@ -78,7 +78,7 @@ def test_process_tokenizer(text, expected_tokens, expected_indices):
 
     tk.process(message)
 
-    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
+    tokens = message.get(TOKENS_NAMES[TEXT])
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -98,6 +98,6 @@ def test_split_intent(text, expected_tokens):
     tk = WhitespaceTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     assert [t.text for t in tk._split_intent(message)] == expected_tokens
diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py
index 5a7cff88ebaa..df459bb33659 100644
--- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py
+++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py
@@ -1,6 +1,6 @@
 import pytest
 
-from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, INTENT_ATTRIBUTE
+from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT
 from rasa.nlu.training_data import TrainingData, Message
 from tests.nlu import utilities
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
@@ -34,7 +34,7 @@ def test_whitespace(text, expected_tokens, expected_indices):
 
     tk = WhitespaceTokenizer()
 
-    tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(Message(text), attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
     assert [t.start for t in tokens] == [i[0] for i in expected_indices]
@@ -54,13 +54,11 @@ def test_custom_intent_symbol(text, expected_tokens):
     tk = WhitespaceTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     tk.train(TrainingData([message]))
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
 
 
 @pytest.mark.parametrize(
@@ -77,7 +75,7 @@ def test_whitespace_with_case(text, component_config, expected_tokens):
 
     message = Message(text)
 
-    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)
+    tokens = tk.tokenize(message, attribute=TEXT)
 
     assert [t.text for t in tokens] == expected_tokens
 
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index 9f4f7903bec3..a27c39672c06 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -1,7 +1,7 @@
 import pytest
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
-from rasa.nlu.constants import BILOU_ENTITIES_ATTRIBUTE, ENTITIES_ATTRIBUTE
+from rasa.nlu.constants import BILOU_ENTITIES, ENTITIES
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.training_data import TrainingData, Message
 
@@ -25,7 +25,7 @@ def test_entity_name_from_tag(tag, expected):
 def test_tags_to_ids():
     message = Message("Germany is part of the European Union")
     message.set(
-        BILOU_ENTITIES_ATTRIBUTE,
+        BILOU_ENTITIES,
         ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
     )
 
@@ -47,14 +47,12 @@ def test_remove_bilou_prefixes():
 def test_build_tag_id_dict():
     message_1 = Message("Germany is part of the European Union")
     message_1.set(
-        BILOU_ENTITIES_ATTRIBUTE,
+        BILOU_ENTITIES,
         ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
     )
 
     message_2 = Message("Berlin is the capital of Germany")
-    message_2.set(
-        BILOU_ENTITIES_ATTRIBUTE, ["U-location", "O", "O", "O", "O", "U-location"]
-    )
+    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])
 
     training_data = TrainingData([message_1, message_2])
 
@@ -78,7 +76,7 @@ def test_apply_bilou_schema():
 
     message_1 = Message("Germany is part of the European Union")
     message_1.set(
-        ENTITIES_ATTRIBUTE,
+        ENTITIES,
         [
             {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
             {
@@ -92,7 +90,7 @@ def test_apply_bilou_schema():
 
     message_2 = Message("Berlin is the capital of Germany")
     message_2.set(
-        ENTITIES_ATTRIBUTE,
+        ENTITIES,
         [
             {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
             {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
@@ -105,7 +103,7 @@ def test_apply_bilou_schema():
 
     bilou_utils.apply_bilou_schema(training_data)
 
-    assert message_1.get(BILOU_ENTITIES_ATTRIBUTE) == [
+    assert message_1.get(BILOU_ENTITIES) == [
         "U-location",
         "O",
         "O",
@@ -115,7 +113,7 @@ def test_apply_bilou_schema():
         "L-organisation",
         "O",
     ]
-    assert message_2.get(BILOU_ENTITIES_ATTRIBUTE) == [
+    assert message_2.get(BILOU_ENTITIES) == [
         "U-location",
         "O",
         "O",

From fa5857d5ec6e07db5c31ff83bd80c74709129175 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 11 Feb 2020 10:05:06 +0100
Subject: [PATCH 335/633] update docs

---
 docs/core/policies.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index e2f59ef8f722..def75b9d9f22 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -192,6 +192,15 @@ In order to get reproducible training results for the same inputs you can
 set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer.
 
 
+.. _embedding_policy:
+
+Embedding Policy
+^^^^^^^^^^^^^^^^
+
+``EmbeddingPolicy`` got renamed to ``TEDPolicy``.
+Please take a look at :ref:`ted_policy` for more details.
+
+
 .. _ted_policy:
 
 TED Policy

From bac3a9d0a08a7ad9745705778876ad4b99b77bbb Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 12:26:22 +0100
Subject: [PATCH 336/633] fix training on empty data

---
 rasa/core/policies/ted_policy.py        | 20 +++++++
 rasa/nlu/classifiers/diet_classifier.py | 71 +++++++++++++------------
 rasa/nlu/selectors/response_selector.py | 17 ++++++
 rasa/utils/tensorflow/tf_layers.py      |  9 +---
 rasa/utils/tensorflow/tf_model_data.py  |  3 ++
 5 files changed, 78 insertions(+), 42 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 225acaf91fae..e0bfeb8563af 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -266,6 +266,12 @@ def train(
 
         # extract actual training data to feed to model
         model_data = self._create_model_data(training_data.X, training_data.y)
+        if model_data.is_empty():
+            logger.error(
+                "Can not train TED policy. No data was provided. "
+                "Skipping training of the policy."
+            )
+            return
 
         # keep one example for persisting and loading
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
@@ -444,6 +450,8 @@ def __init__(
 
         # data
         self.data_signature = data_signature
+        self._check_data()
+
         self.predict_data_signature = {
             k: vs for k, vs in data_signature.items() if "dialogue" in k
         }
@@ -467,6 +475,18 @@ def __init__(
         self._tf_layers = {}
         self._prepare_layers()
 
+    def _check_data(self):
+        if "dialogue_features" not in self.data_signature:
+            raise ValueError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if "label_features" not in self.data_signature:
+            raise ValueError(
+                f"No label features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+
     def _prepare_layers(self) -> None:
         self._tf_layers["loss.label"] = tf_layers.DotProductLoss(
             self.config[NUM_NEG],
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index e62e883db901..cf16bab38da9 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -573,14 +573,18 @@ def train(
         np.random.seed(self.component_config[RANDOM_SEED])
 
         model_data = self.preprocess_train_data(training_data)
+        if model_data.is_empty():
+            logger.error(
+                "Can not train DIET classifier. No data was provided. "
+                "Skipping training of the classifier."
+            )
+            return
 
         if self.component_config[INTENT_CLASSIFICATION]:
-            possible_to_train = self._check_enough_labels(model_data)
-
-            if not possible_to_train:
+            if not self._check_enough_labels(model_data):
                 logger.error(
                     "Can not train intent classifier. "
-                    "Need at least 2 different classes. "
+                    "Need at least 2 different intent classes. "
                     "Skipping training of classifier."
                 )
                 return
@@ -607,6 +611,10 @@ def train(
     # process helpers
     def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
         if self.model is None:
+            logger.error(
+                "There is no trained model: component is either not trained or "
+                "didn't receive enough training data."
+            )
             return
 
         # create session data from message and convert it into a batch of 1
@@ -615,21 +623,17 @@ def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
         return self.model.predict(model_data)
 
     def _predict_label(
-        self, out: Dict[Text, tf.Tensor]
+        self, predict_out: Optional[Dict[Text, tf.Tensor]]
     ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]:
         """Predicts the intent of the provided message."""
 
         label = {"name": None, "confidence": 0.0}
         label_ranking = []
 
-        if self.model is None:
-            logger.error(
-                "There is no trained model: component is either not trained or "
-                "didn't receive enough training data."
-            )
+        if predict_out is None:
             return label, label_ranking
 
-        message_sim = out["i_scores"].numpy()
+        message_sim = predict_out["i_scores"].numpy()
 
         message_sim = message_sim.flatten()  # sim is a matrix
 
@@ -671,17 +675,13 @@ def _predict_label(
         return label, label_ranking
 
     def _predict_entities(
-        self, out: Dict[Text, tf.Tensor], message: Message
+        self, predict_out: Optional[Dict[Text, tf.Tensor]], message: Message
     ) -> List[Dict]:
-        if self.model is None:
-            logger.error(
-                "There is no trained model: component is either not trained or "
-                "didn't receive enough training data"
-            )
+        if predict_out is None:
             return []
 
         # load tf graph and session
-        predictions = out["e_ids"].numpy()
+        predictions = predict_out["e_ids"].numpy()
 
         tags = [self.inverted_tag_dict[p] for p in predictions[0]]
 
@@ -902,6 +902,8 @@ def __init__(
 
         # data
         self.data_signature = data_signature
+        self._check_data()
+
         self.predict_data_signature = {
             k: vs for k, vs in data_signature.items() if "text" in k
         }
@@ -925,22 +927,27 @@ def __init__(
 
         self.all_labels_embed = None  # needed for efficient prediction
 
-        self._check_data()
-
     def _check_data(self):
         if "text_features" not in self.data_signature:
             raise ValueError(
-                "No text features specified. Cannot train 'DIETClassifier'."
-            )
-        if (
-            self.config[INTENT_CLASSIFICATION]
-            and "label_features" not in self.data_signature
-        ):
-            raise ValueError(
-                "No label features specified. Cannot train 'DIETClassifier'."
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
             )
+        if self.config[INTENT_CLASSIFICATION]:
+            if "label_features" not in self.data_signature:
+                raise ValueError(
+                    f"No label features specified. "
+                    f"Cannot train '{self.__class__.__name__}' model."
+                )
+            if self.config[SHARE_HIDDEN_LAYERS] and self.data_signature["text_features"] != self.data_signature["label_features"]:
+                raise ValueError(
+                    "If hidden layer weights are shared, data signatures "
+                    "for text_features and label_features must coincide."
+                )
+
         if self.config[ENTITY_RECOGNITION] and "tag_ids" not in self.data_signature:
-            raise ValueError("No tag ids present. Cannot train 'DIETClassifier'.")
+            raise ValueError(f"No tag ids present. "
+                             f"Cannot train '{self.__class__.__name__}' model.")
 
     def _create_metrics(self):
         # self.metrics preserve order
@@ -1004,12 +1011,6 @@ def _prepare_sparse_dense_layers(
                 )
 
     def _prepare_input_layers(self, name: Text) -> None:
-        if f"{name}_features" not in self.data_signature:
-            raise KeyError(
-                f"Features for '{name}' are not present "
-                f"in data signature: {self.data_signature}."
-            )
-
         self._tf_layers[f"sparse_dropout.{name}"] = tf_layers.SparseDropout(
             rate=self.config[DROPRATE]
         )
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index e54232d726c8..5b3968b982bf 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -261,6 +261,23 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
 
 class DIET2DIET(DIET):
+    def _check_data(self):
+        if "text_features" not in self.data_signature:
+            raise ValueError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if "label_features" not in self.data_signature:
+            raise ValueError(
+                f"No label features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if self.config[SHARE_HIDDEN_LAYERS] and self.data_signature["text_features"] != self.data_signature["label_features"]:
+            raise ValueError(
+                "If hidden layer weights are shared, data signatures "
+                "for text_features and label_features must coincide."
+            )
+
     def _create_metrics(self):
         # self.metrics preserve order
         # output losses first
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index c157e32d1100..2d6dcabccd18 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -397,11 +397,8 @@ def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Ten
 
 class InputMask(tf.keras.layers.Layer):
     def build(self, input_shape: tf.TensorShape) -> None:
-        initializer = tf.keras.initializers.GlorotUniform()
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]),
-            initializer=initializer,
-            trainable=True,
             name="mask_vector",
         )
         self.built = True
@@ -451,12 +448,10 @@ class CRF(tf.keras.layers.Layer):
     def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
 
-        initializer = tf.keras.initializers.GlorotUniform()
-        l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
+        regularizer = tf.keras.regularizers.l2(reg_lambda)
         self.transition_params = self.add_weight(
             shape=(num_tags, num_tags),
-            initializer=initializer,
-            regularizer=l2_regularizer,
+            regularizer=regularizer,
             trainable=True,
             name="transitions",
         )
diff --git a/rasa/utils/tensorflow/tf_model_data.py b/rasa/utils/tensorflow/tf_model_data.py
index 58949d6508b7..b4918a16b9c3 100644
--- a/rasa/utils/tensorflow/tf_model_data.py
+++ b/rasa/utils/tensorflow/tf_model_data.py
@@ -45,6 +45,9 @@ def keys(self):
     def feature_not_exists(self, key: Text) -> bool:
         return key not in self.data or not self.data[key]
 
+    def is_empty(self):
+        return not self.data
+
     def get_number_of_examples(self) -> int:
         """Obtain number of examples in data.
 

From 8bce2645816b54bb2b033897f4e8d0382fccad42 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 12:28:00 +0100
Subject: [PATCH 337/633] black

---
 rasa/nlu/classifiers/diet_classifier.py | 12 +++++++++---
 rasa/nlu/selectors/response_selector.py |  6 +++++-
 rasa/utils/tensorflow/tf_layers.py      |  3 +--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index cf16bab38da9..ddc8fc71d37c 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -939,15 +939,21 @@ def _check_data(self):
                     f"No label features specified. "
                     f"Cannot train '{self.__class__.__name__}' model."
                 )
-            if self.config[SHARE_HIDDEN_LAYERS] and self.data_signature["text_features"] != self.data_signature["label_features"]:
+            if (
+                self.config[SHARE_HIDDEN_LAYERS]
+                and self.data_signature["text_features"]
+                != self.data_signature["label_features"]
+            ):
                 raise ValueError(
                     "If hidden layer weights are shared, data signatures "
                     "for text_features and label_features must coincide."
                 )
 
         if self.config[ENTITY_RECOGNITION] and "tag_ids" not in self.data_signature:
-            raise ValueError(f"No tag ids present. "
-                             f"Cannot train '{self.__class__.__name__}' model.")
+            raise ValueError(
+                f"No tag ids present. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
 
     def _create_metrics(self):
         # self.metrics preserve order
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 5b3968b982bf..3102d2ff22bd 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -272,7 +272,11 @@ def _check_data(self):
                 f"No label features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
-        if self.config[SHARE_HIDDEN_LAYERS] and self.data_signature["text_features"] != self.data_signature["label_features"]:
+        if (
+            self.config[SHARE_HIDDEN_LAYERS]
+            and self.data_signature["text_features"]
+            != self.data_signature["label_features"]
+        ):
             raise ValueError(
                 "If hidden layer weights are shared, data signatures "
                 "for text_features and label_features must coincide."
diff --git a/rasa/utils/tensorflow/tf_layers.py b/rasa/utils/tensorflow/tf_layers.py
index 2d6dcabccd18..64f147bdfe9d 100644
--- a/rasa/utils/tensorflow/tf_layers.py
+++ b/rasa/utils/tensorflow/tf_layers.py
@@ -398,8 +398,7 @@ def call(self, x: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor) -> tf.Ten
 class InputMask(tf.keras.layers.Layer):
     def build(self, input_shape: tf.TensorShape) -> None:
         self.mask_vector = self.add_weight(
-            shape=(1, 1, input_shape[-1]),
-            name="mask_vector",
+            shape=(1, 1, input_shape[-1]), name="mask_vector",
         )
         self.built = True
 

From e3dad46fdc78af901a57f74c7d701215daef4be5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 12:34:17 +0100
Subject: [PATCH 338/633] set config first

---
 rasa/core/policies/ted_policy.py        | 4 ++--
 rasa/nlu/classifiers/diet_classifier.py | 9 ++++-----
 rasa/nlu/selectors/response_selector.py | 4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e0bfeb8563af..9e42ed0c7309 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -442,7 +442,7 @@ def __init__(
         config: Dict[Text, Any],
         max_history_tracker_featurizer_used: bool,
         label_data: RasaModelData,
-    ):
+    ) -> None:
         super().__init__(name="TED", random_seed=config[RANDOM_SEED])
 
         self.config = config
@@ -475,7 +475,7 @@ def __init__(
         self._tf_layers = {}
         self._prepare_layers()
 
-    def _check_data(self):
+    def _check_data(self) -> None:
         if "dialogue_features" not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index ddc8fc71d37c..5993f4593fe2 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -900,7 +900,8 @@ def __init__(
     ) -> None:
         super().__init__(name="DIET", random_seed=config[RANDOM_SEED])
 
-        # data
+        self.config = config
+
         self.data_signature = data_signature
         self._check_data()
 
@@ -914,8 +915,6 @@ def __init__(
         )
         self._num_tags = len(inverted_tag_dict) if inverted_tag_dict is not None else 0
 
-        self.config = config
-
         # tf objects
         self._tf_layers = {}
         self._prepare_layers()
@@ -927,7 +926,7 @@ def __init__(
 
         self.all_labels_embed = None  # needed for efficient prediction
 
-    def _check_data(self):
+    def _check_data(self) -> None:
         if "text_features" not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
@@ -955,7 +954,7 @@ def _check_data(self):
                 f"Cannot train '{self.__class__.__name__}' model."
             )
 
-    def _create_metrics(self):
+    def _create_metrics(self) -> None:
         # self.metrics preserve order
         # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 3102d2ff22bd..b9a80dc1d01e 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -261,7 +261,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
 
 class DIET2DIET(DIET):
-    def _check_data(self):
+    def _check_data(self) -> None:
         if "text_features" not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
@@ -282,7 +282,7 @@ def _check_data(self):
                 "for text_features and label_features must coincide."
             )
 
-    def _create_metrics(self):
+    def _create_metrics(self) -> None:
         # self.metrics preserve order
         # output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")

From 3cc029fd9f0882fe3307a1e4d3a293125edbb715 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 13:07:15 +0100
Subject: [PATCH 339/633] fix relative attn

---
 rasa/utils/tensorflow/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 5e3417f0a916..6776023eec03 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -106,9 +106,9 @@ def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor
 
     def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
         if self.unidirectional:
-            # pad the right side to length
+            # pad the right side to relative_length
             pad_right = tf.zeros_like(x[:, :, :, -1:, :])
-            pad_right = tf.tile(pad_right, (1, 1, 1, length - 1, 1))
+            pad_right = tf.tile(pad_right, (1, 1, 1, self.relative_length - 1, 1))
             x = tf.concat([x, pad_right], axis=-2)
 
         dl = self.relative_length - length

From e4b5395d0821b88bd4d741e9c391687fb8c87787 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 11 Feb 2020 14:42:33 +0100
Subject: [PATCH 340/633] update components.rst

---
 docs/nlu/components.rst                       | 458 ++++++++++++------
 docs/user-guide/installation.rst              |   1 +
 .../dense_featurizer/mitie_featurizer.py      |  11 +-
 rasa/utils/tensorflow/constants.py            |   2 +-
 4 files changed, 325 insertions(+), 147 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c5f6c2a8a060..3f853c5f065b 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -57,15 +57,15 @@ MitieNLP
 SpacyNLP
 ~~~~~~~~
 
-:Short: spacy language initializer
+:Short: spaCy language initializer
 :Outputs: nothing
 :Requires: nothing
 :Description:
-    Initializes spacy structures. Every spacy component relies on this, hence this should be put at the beginning
-    of every pipeline that uses any spacy components.
+    Initializes spacy structures. Every spaCy component relies on this, hence this should be put at the beginning
+    of every pipeline that uses any spaCy components.
 :Configuration:
     Language model, default will use the configured language.
-    If the spacy model to be used has a name that is different from the language tag (``"en"``, ``"de"``, etc.),
+    If the spaCy model to be used has a name that is different from the language tag (``"en"``, ``"de"``, etc.),
     the model name can be specified using this configuration variable. The name will be passed to ``spacy.load(name)``.
 
     .. code-block:: yaml
@@ -82,6 +82,139 @@ SpacyNLP
           # between these two words, therefore setting this to `true`.
           case_sensitive: false
 
+    For more information on how to obtain the spaCy models, head over to
+    :ref:`installing SpaCy <install-spacy>`.
+
+.. _tokenizers:
+
+Tokenizers
+----------
+
+Tokenizers split text into tokens.
+If you want to split intents into multiple labels, e.g. for predicting multiple intents or for
+modeling hierarchical intent structure, use these flags with any tokenizer:
+
+- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. By default this flag is set to
+  ``False``, intent will not be tokenized.
+- ``intent_split_symbol`` sets the delimiter string to split the intent labels, default is underscore
+  (``_``).
+
+    .. note:: All tokenizer add an additional token ``__CLS__`` to the end of the list of tokens when tokenizing
+              text and responses.
+
+WhitespaceTokenizer
+~~~~~~~~~~~~~~~~~~~
+
+:Short: Tokenizer using whitespaces as a separator
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: nothing
+:Description:
+    Creates a token for every whitespace separated character sequence.
+:Configuration:
+    Make the tokenizer not case sensitive by adding the ``case_sensitive: False`` option.
+    Default being ``case_sensitive: True``.
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "WhitespaceTokenizer"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
+          # Text will be tokenized with case sensitive as default
+          "case_sensitive": True
+
+
+JiebaTokenizer
+~~~~~~~~~~~~~~
+
+:Short: Tokenizer using Jieba for Chinese language
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: nothing
+:Description:
+    Creates tokens using the Jieba tokenizer specifically for Chinese
+    language. For language other than Chinese, Jieba will work as
+    ``WhitespaceTokenizer``.
+
+    .. note::
+        To use ``JiebaTokenizer`` you need to install Jieba with ``pip install jieba``.
+
+:Configuration:
+    User's custom dictionary files can be auto loaded by specifying the files' directory path via ``dictionary_path``.
+    If the ``dictionary_path`` is ``None`` (the default), then no custom dictionary will be used.
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "JiebaTokenizer"
+          dictionary_path: "path/to/custom/dictionary/dir"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
+
+
+MitieTokenizer
+~~~~~~~~~~~~~~
+
+:Short: Tokenizer using MITIE
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: :ref:`MitieNLP`
+:Description: Creates tokens using the MITIE tokenizer.
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "MitieTokenizer"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
+
+SpacyTokenizer
+~~~~~~~~~~~~~~
+
+:Short: Tokenizer using spaCy
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: :ref:`SpacyNLP`
+:Description:
+    Creates tokens using the spaCy tokenizer.
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "SpacyTokenizer"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
+
+.. _ConveRTTokenizer:
+
+ConveRTTokenizer
+~~~~~~~~~~~~~~~~
+
+:Short: Tokenizer using ConveRT
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: nothing
+:Description:
+    Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.
+:Configuration:
+    Make the tokenizer not case sensitive by adding the ``case_sensitive: False`` option.
+    Default being ``case_sensitive: True``.
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "SpacyTokenizer"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
+
 Text Featurizers
 ----------------
 
@@ -101,17 +234,18 @@ The corresponding classifier can therefore decide what kind of features to use.
 MitieFeaturizer
 ~~~~~~~~~~~~~~~
 
-:Short: MITIE intent featurizer
-:Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``)
+:Short:
+    Creates a vector representation of user message and response (if specified) using the spaCy featurizer.
+:Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`MitieNLP`
 :Type: Dense featurizer
 :Description:
-    Creates feature for intent classification using the MITIE featurizer.
+    Creates features for entity extraction, intent classification, and response classification using the MITIE
+    featurizer.
 
     .. note::
 
-        NOT used by the ``MitieIntentClassifier`` component. Currently, only ``SklearnIntentClassifier`` is able
-        to use precomputed features.
+        NOT used by the ``MitieIntentClassifier`` component.
 
 :Configuration:
     The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
@@ -123,19 +257,21 @@ MitieFeaturizer
         pipeline:
         - name: "MitieFeaturizer"
           # Specify what pooling operation should be used to calculate the vector of
-          # the CLS token. Available options: 'mean' and 'max'
+          # the CLS token. Available options: 'mean' and 'max'.
           "pooling": "mean"
 
 
 SpacyFeaturizer
 ~~~~~~~~~~~~~~~
 
-:Short: spacy intent featurizer
-:Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``)
+:Short:
+    Creates a vector representation of user message and response (if specified) using the spaCy featurizer.
+:Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`SpacyNLP`
 :Type: Dense featurizer
 :Description:
-    Creates feature for intent classification using the spacy featurizer.
+    Creates features for entity extraction, intent classification, and response classification using the spaCy
+    featurizer.
 :Configuration:
     The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
@@ -146,7 +282,7 @@ SpacyFeaturizer
         pipeline:
         - name: "SpacyFeaturizer"
           # Specify what pooling operation should be used to calculate the vector of
-          # the CLS token. Available options: 'mean' and 'max'
+          # the CLS token. Available options: 'mean' and 'max'.
           "pooling": "mean"
 
 
@@ -156,13 +292,11 @@ ConveRTFeaturizer
 :Short:
     Creates a vector representation of user message and response (if specified) using
     `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model.
-:Outputs:
-    nothing, used as an input to intent classifiers and response selectors that need intent features and response
-    features respectively (e.g. ``DIETClassifier`` and ``ResponseSelector``)
+:Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`ConveRTTokenizer`
 :Type: Dense featurizer
 :Description:
-    Creates features for intent classification and response selection.
+    Creates features for entity extraction, intent classification, and response selection.
     Uses the `default signature <https://github.com/PolyAI-LDN/polyai-models#tfhub-signatures>`_ to compute vector
     representations of input text.
 
@@ -185,35 +319,36 @@ ConveRTFeaturizer
 RegexFeaturizer
 ~~~~~~~~~~~~~~~
 
-:Short: regex feature creation to support intent and entity classification
-:Outputs: ``text_features`` and ``tokens.pattern``
-:Requires: nothing
+:Short: Creates a vector representation of user message using regular expressions.
+:Outputs: ``sparse_features`` for texts and ``tokens.pattern``
+:Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
     Creates features for entity extraction and intent classification.
-    During training, the regex intent featurizer creates a list of `regular expressions` defined in the training
+    During training ``RegexFeaturizer`` creates a list of `regular expressions` defined in the training
     data format.
     For each regex, a feature will be set marking whether this expression was found in the input, which will later
     be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned
-    during the training phase, that this set feature indicates a certain intent).
+    during the training phase, that this set feature indicates a certain intent / entity).
     Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` component!
 
-    .. note:: There needs to be a tokenizer previous to this featurizer in the pipeline!
+:Configuration:
 
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "RegexFeaturizer"
 
 CountVectorsFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: Creates bag-of-words representation of user message and label (intent and response) features
-:Outputs:
-   nothing, used as an input to intent classifiers that
-   need bag-of-words representation of intent features
-   (e.g. ``DIETClassifier``)
-:Requires: nothing
+:Short: Creates bag-of-words representation of user messages, intents, and responses.
+:Outputs: ``sparse_features`` for texts, intents, and responses
+:Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
     Creates features for intent classification and response selection.
-    Creates bag-of-words representation of user message and label features using
+    Creates bag-of-words representation of user message, intent, and response using
     `sklearn's CountVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
     All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature.
 
@@ -250,7 +385,7 @@ CountVectorsFeaturizer
         In this case during prediction all unknown words will be treated as this generic word ``OOV_token``.
 
         For example, one might create separate intent ``outofscope`` in the training data containing messages of
-        different number of ``OOV_token`` s and maybe some additional general words.
+        different number of ``OOV_token``s and maybe some additional general words.
         Then an algorithm will likely classify a message with unknown words as this intent ``outofscope``.
 
         .. note::
@@ -319,9 +454,8 @@ CountVectorsFeaturizer
 LexicalSyntacticFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: Lexical and syntactic feature creation to support entity extraction.
-:Outputs:
-   ``text_sparse_features``
+:Short: Creates lexical and syntactic features for user message to support entity extraction.
+:Outputs: ``sparse_features`` for texts
 :Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
@@ -339,7 +473,8 @@ LexicalSyntacticFeaturizer
     EOS             Checks if the token is at the end of the sentence.
     low             Checks if the token is lower case.
     upper           Checks if the token is upper case.
-    title           Checks if the token starts with an uppercase character and all remaining characters are lowercased.
+    title           Checks if the token starts with an uppercase character and all remaining characters are
+                    lowercased.
     digit           Checks if the token contains just digits.
     prefix5         Take the first five characters of the token.
     prefix2         Take the first two characters of the token.
@@ -352,7 +487,7 @@ LexicalSyntacticFeaturizer
     ==============  =============================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
-    previous words, the current word in the sliding window, and the next words.
+    previous words, the current word, and the next words in the sliding window.
     You define the features as [before, token, after] array.
     If you, for example, want to define features for the token before, the current token, and the token after,
     your features configuration could look like this:
@@ -387,6 +522,7 @@ LexicalSyntacticFeaturizer
 Intent Classifiers
 ------------------
 
+Intent classifiers assign one of the intents defined in the domain file to incoming user messages.
 
 MitieIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~
@@ -395,7 +531,7 @@ MitieIntentClassifier
     MITIE intent classifier (using a
     `text categorizer <https://github.com/mit-nlp/MITIE/blob/master/examples/python/text_categorizer_pure_model.py>`_)
 :Outputs: ``intent``
-:Requires: A tokenizer and a featurizer
+:Requires: ``tokens`` for user message
 :Output-Example:
 
     .. code-block:: json
@@ -419,9 +555,9 @@ MitieIntentClassifier
 SklearnIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: sklearn intent classifier
+:Short: Sklearn intent classifier
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: A featurizer
+:Requires: ``dense_features`` for user message
 :Output-Example:
 
     .. code-block:: json
@@ -442,14 +578,14 @@ SklearnIntentClassifier
 
 :Description:
     The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. In addition
-    to other classifiers it also provides rankings of the labels that did not "win". The spacy intent classifier
-    needs to be preceded by a featurizer in the pipeline. This featurizer creates the features used for the
-    classification.
+    to other classifiers it also provides rankings of the labels that did not "win". The ``SklearnIntentClassifier``
+    needs to be preceded by a dense featurizer in the pipeline. This dense featurizer creates the features used for
+    the classification.
 
 :Configuration:
     During the training of the SVM a hyperparameter search is run to
     find the best parameter set. In the config, you can specify the parameters
-    that will get tried
+    that will get tried.
 
     .. code-block:: yaml
 
@@ -463,12 +599,146 @@ SklearnIntentClassifier
           # This is used with the ``C`` hyperparameter in GridSearchCV.
           kernels: ["linear"]
 
+EmbeddingIntentClassifier
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:Short: Dual Intent Entity Transformer used for intent classification
+:Outputs: ``intent`` and ``intent_ranking``
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
+:Output-Example:
+
+    .. code-block:: json
+
+        {
+            "intent": {"name": "greet", "confidence": 0.78343},
+            "intent_ranking": [
+                {
+                    "confidence": 0.1485910906220309,
+                    "name": "goodbye"
+                },
+                {
+                    "confidence": 0.08161531595656784,
+                    "name": "restaurant_search"
+                }
+            ]
+        }
+
+:Description:
+    The ``EmbeddingIntentClassifier`` embeds user inputs and intent labels into the same space.
+    Supervised embeddings are trained by maximizing similarity between them.
+    This algorithm is based on `StarSpace <https://arxiv.org/abs/1709.03856>`_.
+    However, in this implementation the loss function is slightly different and
+    additional hidden layers are added together with dropout.
+    This algorithm also provides similarity rankings of the labels that did not "win".
+
+    The ``EmbeddingIntentClassifier`` needs to be preceded by a featurizer in the pipeline.
+    This featurizer creates the features used for the embeddings.
+    It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded
+    by ``SpacyNLP`` and ``SpacyTokenizer``.
+
+    .. note:: If during prediction time a message contains **only** words unseen during training,
+              and no Out-Of-Vacabulary preprocessor was used,
+              empty intent ``None`` is predicted with confidence ``0.0``.
+
+    .. warning::
+        ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
+        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+
+
+:Configuration:
+
+    The algorithm has hyperparameters to control:
+
+        - neural network's architecture:
+
+            - ``hidden_layers_sizes_text`` sets a list of hidden layer sizes before
+              the embedding layer for user inputs, the number of hidden layers
+              is equal to the length of the list
+            - ``hidden_layers_sizes_label`` sets a list of hidden layer sizes before
+              the embedding layer for intent labels, the number of hidden layers
+              is equal to the length of the list
+            - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent label
+
+        - training:
+
+            - ``batch_size`` sets the number of training examples in one
+              forward/backward pass, the higher the batch size, the more
+              memory space you'll need;
+            - ``batch_strategy`` sets the type of batching strategy,
+              it should be either ``sequence`` or ``balanced``;
+            - ``epochs`` sets the number of times the algorithm will see
+              training data, where one ``epoch`` equals one forward pass and
+              one backward pass of all the training examples;
+            - ``random_seed`` if set to any int will get reproducible
+              training results for the same inputs;
+            - ``learning_rate`` to set the learning rate of the optimizer
+
+        - embedding:
+
+            - ``dense_dimension`` sets the dimensions
+            - ``embedding_dimension`` sets the dimension of embedding space;
+            - ``number_of_negative_examples`` sets the number of incorrect intent labels,
+              the algorithm will minimize their similarity to the user
+              input during training;
+            - ``similarity_type`` sets the type of the similarity,
+              it should be either ``auto``, ``cosine`` or ``inner``,
+              if ``auto``, it will be set depending on ``loss_type``,
+              ``inner`` for ``softmax``, ``cosine`` for ``margin``;
+            - ``loss_type`` sets the type of the loss function,
+              it should be either ``softmax`` or ``margin``;
+            - ``ranking_length`` defines the number of top confidences over
+              which to normalize ranking results if ``loss_type: "softmax"``;
+              to turn off normalization set it to 0
+            - ``maximum_positive_similarity`` controls how similar the algorithm should try
+              to make embedding vectors for correct intent labels,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``maximum_negative_similarity`` controls maximum negative similarity for
+              incorrect intents,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``use_maximum_negative_similarity`` if ``true`` the algorithm only
+              minimizes maximum similarity over incorrect intent labels,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``scale_loss`` if ``true`` the algorithm will downscale the loss
+              for examples where correct label is predicted with high confidence,
+              used only if ``loss_type`` is set to ``softmax``;
+
+        - regularization:
+
+            - ``l2_regularization`` sets the scale of L2 regularization
+            - ``C_emb`` sets the scale of how important is to minimize
+              the maximum similarity between embeddings of different intent labels;
+            - ``droprate`` sets the dropout rate, it should be
+              between ``0`` and ``1``, e.g. ``droprate=0.1``
+              would drop out ``10%`` of input units;
+            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not
+
+    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
+
+    .. note:: There is an option to use linearly increasing batch size. The idea comes from
+              `<https://arxiv.org/abs/1711.00489>`_.
+              In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
+              If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
+
+    In the config, you can specify these parameters.
+    The default values are defined in ``EmbeddingIntentClassifier.defaults``:
+
+    .. literalinclude:: ../../rasa/nlu/classifiers/embedding_intent_classifier.py
+       :dedent: 4
+       :start-after: # default properties (DOC MARKER - don't remove)
+       :end-before: # end default properties (DOC MARKER - don't remove)
+
+    .. note:: Parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original
+              starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity`` and
+              ``use_maximum_negative_similarity = False``.
+              See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
+
+
 DIETClassifier
 ~~~~~~~~~~~~~~
 
 :Short: Dual Intent Entity Transformer used for intent classification and entity extraction
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: A featurizer
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
 :Output-Example:
 
     .. code-block:: json
@@ -488,14 +758,14 @@ DIETClassifier
         }
 
 :Description:
-    The embedding intent classifier embeds user inputs and intent labels into the same space.
+    The ``DIETClassifier`` embeds user inputs and intent labels into the same space.
     Supervised embeddings are trained by maximizing similarity between them.
     This algorithm is based on `StarSpace <https://arxiv.org/abs/1709.03856>`_.
     However, in this implementation the loss function is slightly different and
     additional hidden layers are added together with dropout.
     This algorithm also provides similarity rankings of the labels that did not "win".
 
-    The embedding intent classifier needs to be preceded by a featurizer in the pipeline.
+    The ``DIETClassifier`` needs to be preceded by a featurizer in the pipeline.
     This featurizer creates the features used for the embeddings.
     It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded
     by ``SpacyNLP`` and ``SpacyTokenizer``.
@@ -690,100 +960,6 @@ Response Selector
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)
 
-.. _tokenizers:
-
-Tokenizers
-----------
-
-If you want to split intents into multiple labels, e.g. for predicting multiple intents or for
-modeling hierarchical intent structure, use these flags with any tokenizer:
-
-- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. By default this flag is set to
-  ``False``, intent will not be tokenized.
-- ``intent_split_symbol`` sets the delimiter string to split the intent labels, default is underscore
-  (``_``).
-
-    .. note:: All tokenizer add an additional token ``__CLS__`` to the end of the list of tokens when tokenizing
-              text and responses.
-
-WhitespaceTokenizer
-~~~~~~~~~~~~~~~~~~~
-
-:Short: Tokenizer using whitespaces as a separator
-:Outputs: nothing
-:Requires: nothing
-:Description:
-    Creates a token for every whitespace separated character sequence. Can be used to define tokens for the MITIE entity
-    extractor.
-:Configuration:
-    Make the tokenizer not case sensitive by adding the ``case_sensitive: false`` option. Default being ``case_sensitive: true``.
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "WhitespaceTokenizer"
-          case_sensitive: false
-
-JiebaTokenizer
-~~~~~~~~~~~~~~
-
-:Short: Tokenizer using Jieba for Chinese language
-:Outputs: nothing
-:Requires: nothing
-:Description:
-    Creates tokens using the Jieba tokenizer specifically for Chinese
-    language. For language other than Chinese, Jieba will work as
-    ``WhitespaceTokenizer``. Can be used to define tokens for the
-    MITIE entity extractor. Make sure to install Jieba, ``pip install jieba``.
-:Configuration:
-    User's custom dictionary files can be auto loaded by specific the files' directory path via ``dictionary_path``
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "JiebaTokenizer"
-          dictionary_path: "path/to/custom/dictionary/dir"
-
-If the ``dictionary_path`` is ``None`` (the default), then no custom dictionary will be used.
-
-MitieTokenizer
-~~~~~~~~~~~~~~
-
-:Short: Tokenizer using MITIE
-:Outputs: nothing
-:Requires: :ref:`MitieNLP`
-:Description:
-    Creates tokens using the MITIE tokenizer. Can be used to define
-    tokens for the MITIE entity extractor.
-:Configuration:
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "MitieTokenizer"
-
-SpacyTokenizer
-~~~~~~~~~~~~~~
-
-:Short: Tokenizer using spacy
-:Outputs: nothing
-:Requires: :ref:`SpacyNLP`
-:Description:
-    Creates tokens using the spacy tokenizer. Can be used to define
-    tokens for the MITIE entity extractor.
-
-.. _ConveRTTokenizer:
-
-ConveRTTokenizer
-~~~~~~~~~~~~~~~~
-
-:Short: Tokenizer using ConveRT
-:Outputs: nothing
-:Requires: nothing
-:Description:
-    Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.
-
-
 
 Entity Extractors
 -----------------
diff --git a/docs/user-guide/installation.rst b/docs/user-guide/installation.rst
index f13ec328a636..6afeaaaf5a7f 100644
--- a/docs/user-guide/installation.rst
+++ b/docs/user-guide/installation.rst
@@ -218,6 +218,7 @@ and sklearn_crfsuite get automatically installed. However, spaCy and MITIE need
 
         $ pip install -r alt_requirements/requirements_full.txt
 
+.. _install-spacy:
 
 Dependencies for spaCy
 ######################
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 6aeccac04625..f44573f90695 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -13,7 +13,6 @@
 from rasa.nlu.constants import (
     TEXT_ATTRIBUTE,
     TOKENS_NAMES,
-    MESSAGE_ATTRIBUTES,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
@@ -21,12 +20,14 @@
 
 class MitieFeaturizer(Featurizer):
 
-    provides = [DENSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
-    requires = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [
-        "mitie_feature_extractor"
+    provides = [
+        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
+    requires = [
+        TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ] + ["mitie_feature_extractor"]
+
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
         # the CLS token. Available options: 'mean' and 'max'
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 1a6852ff8f02..6eb6a5a8bd81 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -12,7 +12,7 @@
 
 MAX_SEQ_LENGTH = "maximum_sequence_length"
 
-BATCH_SIZES = "batch_sizes"
+BATCH_SIZES = "batch_size"
 BATCH_STRATEGY = "batch_strategy"
 EPOCHS = "epochs"
 RANDOM_SEED = "random_seed"

From 120e86ed941ab2a269130370267fb072aa705236 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 15:25:16 +0100
Subject: [PATCH 341/633] add rel-attn defaults to EmbeddingPolicy

---
 Makefile                               |  8 ++++----
 rasa/core/policies/embedding_policy.py | 18 +++++++++++++++---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 6f2aefe7128e..c08226336b43 100644
--- a/Makefile
+++ b/Makefile
@@ -59,10 +59,10 @@ prepare-tests-ubuntu: prepare-tests-files
 	sudo apt-get -y install graphviz graphviz-dev python3-tk
 
 prepare-tests-files:
-	pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 --no-cache-dir -q
-	python3 -m spacy link en_core_web_md en --force
-	pip3 install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz#egg=de_core_news_sm==2.1.0 --no-cache-dir -q
-	python3 -m spacy link de_core_news_sm de --force
+	sudo pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 --no-cache-dir -q
+	sudo python3 -m spacy link en_core_web_md en --force
+	sudo pip3 install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz#egg=de_core_news_sm==2.1.0 --no-cache-dir -q
+	sudo python3 -m spacy link de_core_news_sm de --force
 	wget --progress=dot:giga -N -P data/ https://s3-eu-west-1.amazonaws.com/mitie/total_word_feature_extractor.dat
 
 test: clean
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index c780097dac3c..317d77208764 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -7,7 +7,10 @@
 from rasa.constants import DOCS_BASE_URL
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES_LABEL,
+    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -26,11 +29,12 @@
     MU_POS,
     EMBED_DIM,
     HIDDEN_LAYERS_SIZES_DIALOGUE,
-    TRANSFORMER_SIZE,
-    MAX_SEQ_LENGTH,
-    NUM_HEADS,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
+    DROPRATE_ATTENTION,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
 )
 from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.models import RasaModel
@@ -103,11 +107,19 @@ class EmbeddingPolicy(TEDPolicy):
         DROPRATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
         DROPRATE_LABEL: 0.0,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for hold out validation set
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
     }
     # end default properties (DOC MARKER - don't remove)
 

From f5f19a76d247b7081c151bd9dd437000a02854c0 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 15:26:05 +0100
Subject: [PATCH 342/633] remove changes to makefile

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index c08226336b43..6f2aefe7128e 100644
--- a/Makefile
+++ b/Makefile
@@ -59,10 +59,10 @@ prepare-tests-ubuntu: prepare-tests-files
 	sudo apt-get -y install graphviz graphviz-dev python3-tk
 
 prepare-tests-files:
-	sudo pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 --no-cache-dir -q
-	sudo python3 -m spacy link en_core_web_md en --force
-	sudo pip3 install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz#egg=de_core_news_sm==2.1.0 --no-cache-dir -q
-	sudo python3 -m spacy link de_core_news_sm de --force
+	pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0 --no-cache-dir -q
+	python3 -m spacy link en_core_web_md en --force
+	pip3 install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz#egg=de_core_news_sm==2.1.0 --no-cache-dir -q
+	python3 -m spacy link de_core_news_sm de --force
 	wget --progress=dot:giga -N -P data/ https://s3-eu-west-1.amazonaws.com/mitie/total_word_feature_extractor.dat
 
 test: clean

From b67e9ccf0f259c8c0edb8cb46222b9f2fa5ac401 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 15:39:21 +0100
Subject: [PATCH 343/633] use the same weight initializer scheme as keras Dense
 leyer

---
 rasa/utils/tensorflow/layers.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 0579f70f4bbe..e81c16b1cc40 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -4,6 +4,7 @@
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import initializers
 
 logger = logging.getLogger(__name__)
 
@@ -154,9 +155,15 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
 
 
 class InputMask(tf.keras.layers.Layer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.mask_initializer = initializers.get("glorot_uniform")
+
     def build(self, input_shape: tf.TensorShape) -> None:
         self.mask_vector = self.add_weight(
-            shape=(1, 1, input_shape[-1]), name="mask_vector",
+            shape=(1, 1, input_shape[-1]),
+            initializer=self.mask_initializer,
+            name="mask_vector",
         )
         self.built = True
 
@@ -209,10 +216,13 @@ def x_masked():
 class CRF(tf.keras.layers.Layer):
     def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
-
+        initializer = initializers.get("glorot_uniform")
         regularizer = tf.keras.regularizers.l1(reg_lambda)
         self.transition_params = self.add_weight(
-            shape=(num_tags, num_tags), regularizer=regularizer, name="transitions",
+            shape=(num_tags, num_tags),
+            initializer=initializer,
+            regularizer=regularizer,
+            name="transitions",
         )
 
     def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:

From c50a0e3e37ad0f4542549daad603a0f132856249 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 11 Feb 2020 15:42:53 +0100
Subject: [PATCH 344/633] update components.rst #2

---
 docs/nlu/components.rst | 402 ++++++++++++++++++++--------------------
 1 file changed, 204 insertions(+), 198 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 3f853c5f065b..b43151901953 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -631,11 +631,6 @@ EmbeddingIntentClassifier
     additional hidden layers are added together with dropout.
     This algorithm also provides similarity rankings of the labels that did not "win".
 
-    The ``EmbeddingIntentClassifier`` needs to be preceded by a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded
-    by ``SpacyNLP`` and ``SpacyTokenizer``.
-
     .. note:: If during prediction time a message contains **only** words unseen during training,
               and no Out-Of-Vacabulary preprocessor was used,
               empty intent ``None`` is predicted with confidence ``0.0``.
@@ -644,7 +639,6 @@ EmbeddingIntentClassifier
         ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
         `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
 
-
 :Configuration:
 
     The algorithm has hyperparameters to control:
@@ -675,7 +669,7 @@ EmbeddingIntentClassifier
 
         - embedding:
 
-            - ``dense_dimension`` sets the dimensions
+            - ``dense_dimension`` sets the dense dimensions to use for sparse tensors if no dense features are present
             - ``embedding_dimension`` sets the dimension of embedding space;
             - ``number_of_negative_examples`` sets the number of incorrect intent labels,
               the algorithm will minimize their similarity to the user
@@ -712,7 +706,8 @@ EmbeddingIntentClassifier
               would drop out ``10%`` of input units;
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not
 
-    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
+    .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
+              be between ``-1`` and ``1``.
 
     .. note:: There is an option to use linearly increasing batch size. The idea comes from
               `<https://arxiv.org/abs/1711.00489>`_.
@@ -732,132 +727,6 @@ EmbeddingIntentClassifier
               ``use_maximum_negative_similarity = False``.
               See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
 
-
-DIETClassifier
-~~~~~~~~~~~~~~
-
-:Short: Dual Intent Entity Transformer used for intent classification and entity extraction
-:Outputs: ``intent`` and ``intent_ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
-:Output-Example:
-
-    .. code-block:: json
-
-        {
-            "intent": {"name": "greet", "confidence": 0.8343},
-            "intent_ranking": [
-                {
-                    "confidence": 0.385910906220309,
-                    "name": "goodbye"
-                },
-                {
-                    "confidence": 0.28161531595656784,
-                    "name": "restaurant_search"
-                }
-            ]
-        }
-
-:Description:
-    The ``DIETClassifier`` embeds user inputs and intent labels into the same space.
-    Supervised embeddings are trained by maximizing similarity between them.
-    This algorithm is based on `StarSpace <https://arxiv.org/abs/1709.03856>`_.
-    However, in this implementation the loss function is slightly different and
-    additional hidden layers are added together with dropout.
-    This algorithm also provides similarity rankings of the labels that did not "win".
-
-    The ``DIETClassifier`` needs to be preceded by a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded
-    by ``SpacyNLP`` and ``SpacyTokenizer``.
-
-    .. note:: If during prediction time a message contains **only** words unseen during training,
-              and no Out-Of-Vacabulary preprocessor was used,
-              empty intent ``None`` is predicted with confidence ``0.0``.
-
-:Configuration:
-
-    The algorithm also has hyperparameters to control:
-
-        - neural network's architecture:
-
-            - ``hidden_layers_sizes_a`` sets a list of hidden layer sizes before
-              the embedding layer for user inputs, the number of hidden layers
-              is equal to the length of the list
-            - ``hidden_layers_sizes_b`` sets a list of hidden layer sizes before
-              the embedding layer for intent labels, the number of hidden layers
-              is equal to the length of the list
-            - ``share_hidden`` if set to True, shares the hidden layers between user inputs and intent label
-
-        - training:
-
-            - ``batch_size`` sets the number of training examples in one
-              forward/backward pass, the higher the batch size, the more
-              memory space you'll need;
-            - ``batch_strategy`` sets the type of batching strategy,
-              it should be either ``sequence`` or ``balanced``;
-            - ``epochs`` sets the number of times the algorithm will see
-              training data, where one ``epoch`` equals one forward pass and
-              one backward pass of all the training examples;
-            - ``random_seed`` if set to any int will get reproducible
-              training results for the same inputs;
-
-        - embedding:
-
-            - ``embed_dim`` sets the dimension of embedding space;
-            - ``num_neg`` sets the number of incorrect intent labels,
-              the algorithm will minimize their similarity to the user
-              input during training;
-            - ``similarity_type`` sets the type of the similarity,
-              it should be either ``auto``, ``cosine`` or ``inner``,
-              if ``auto``, it will be set depending on ``loss_type``,
-              ``inner`` for ``softmax``, ``cosine`` for ``margin``;
-            - ``loss_type`` sets the type of the loss function,
-              it should be either ``softmax`` or ``margin``;
-            - ``ranking_length`` defines the number of top confidences over
-              which to normalize ranking results if ``loss_type: "softmax"``;
-              to turn off normalization set it to 0
-            - ``mu_pos`` controls how similar the algorithm should try
-              to make embedding vectors for correct intent labels,
-              used only if ``loss_type`` is set to ``margin``;
-            - ``mu_neg`` controls maximum negative similarity for
-              incorrect intents,
-              used only if ``loss_type`` is set to ``margin``;
-            - ``use_max_sim_neg`` if ``true`` the algorithm only
-              minimizes maximum similarity over incorrect intent labels,
-              used only if ``loss_type`` is set to ``margin``;
-            - ``scale_loss`` if ``true`` the algorithm will downscale the loss
-              for examples where correct label is predicted with high confidence,
-              used only if ``loss_type`` is set to ``softmax``;
-
-        - regularization:
-
-            - ``C2`` sets the scale of L2 regularization
-            - ``C_emb`` sets the scale of how important is to minimize
-              the maximum similarity between embeddings of different intent labels;
-            - ``droprate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1``
-              would drop out ``10%`` of input units;
-
-    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
-
-    .. note:: There is an option to use linearly increasing batch size. The idea comes from
-              `<https://arxiv.org/abs/1711.00489>`_.
-              In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
-              If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
-
-    In the config, you can specify these parameters.
-    The default values are defined in ``DIETClassifier.defaults``:
-
-    .. literalinclude:: ../../rasa/nlu/classifiers/diet_classifier.py
-       :dedent: 4
-       :start-after: # default properties (DOC MARKER - don't remove)
-       :end-before: # end default properties (DOC MARKER - don't remove)
-
-    .. note:: Parameter ``mu_neg`` is set to a negative value to mimic the original
-              starspace algorithm in the case ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``.
-              See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
-
-
 .. _keyword_intent_classifier:
 
 KeywordIntentClassifier
@@ -904,20 +773,13 @@ Response Selector
 
 :Short: Response Selector
 :Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
-:Requires: A featurizer
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and response
 
 :Output-Example:
 
     .. code-block:: json
 
         {
-            "text": "What is the recommend python version to install?",
-            "entities": [],
-            "intent": {"confidence": 0.6485910906220309, "name": "faq"},
-            "intent_ranking": [
-                {"confidence": 0.6485910906220309, "name": "faq"},
-                {"confidence": 0.1416153159565678, "name": "greet"}
-            ],
             "response_selector": {
               "faq": {
                 "response": {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
@@ -936,11 +798,6 @@ Response Selector
     It embeds user inputs and response labels into the same space and follows the exact same
     neural network architecture and optimization as the ``DIETClassifier``.
 
-    The response selector needs to be preceded by a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded
-    by ``SpacyNLP``.
-
     .. note:: If during prediction time a message contains **only** words unseen during training,
               and no Out-Of-Vacabulary preprocessor was used,
               empty response ``None`` is predicted with confidence ``0.0``.
@@ -950,7 +807,8 @@ Response Selector
     The algorithm includes all the hyperparameters that ``DIETClassifier`` uses.
     In addition, the component can also be configured to train a response selector for a particular retrieval intent
 
-        - ``retrieval_intent``: sets the name of the intent for which this response selector model is trained. Default ``None``
+        - ``retrieval_intent``: sets the name of the intent for which this response selector model is trained.
+          Default ``None``
 
     In the config, you can specify these parameters.
     The default values are defined in ``ResponseSelector.defaults``:
@@ -968,23 +826,25 @@ MitieEntityExtractor
 ~~~~~~~~~~~~~~~~~~~~
 
 :Short: MITIE entity extraction (using a `MITIE NER trainer <https://github.com/mit-nlp/MITIE/blob/master/mitielib/src/ner_trainer.cpp>`_)
-:Outputs: appends ``entities``
-:Requires: :ref:`MitieNLP`
+:Outputs: ``entities``
+:Requires: :ref:`MitieNLP` and ``tokens``
 :Output-Example:
 
     .. code-block:: json
 
         {
-            "entities": [{"value": "New York City",
-                          "start": 20,
-                          "end": 33,
-                          "confidence": null,
-                          "entity": "city",
-                          "extractor": "MitieEntityExtractor"}]
+            "entities": [{
+                "value": "New York City",
+                "start": 20,
+                "end": 33,
+                "confidence": null,
+                "entity": "city",
+                "extractor": "MitieEntityExtractor"
+            }]
         }
 
 :Description:
-    This uses the MITIE entity extraction to find entities in a message. The underlying classifier
+    ``MitieEntityExtractor`` uses the MITIE entity extraction to find entities in a message. The underlying classifier
     is using a multi class linear SVM with a sparse linear kernel and custom features.
     The MITIE component does not provide entity confidence values.
 :Configuration:
@@ -1000,28 +860,30 @@ SpacyEntityExtractor
 ~~~~~~~~~~~~~~~~~~~~
 
 :Short: spaCy entity extraction
-:Outputs: appends ``entities``
+:Outputs: ``entities``
 :Requires: :ref:`SpacyNLP`
 :Output-Example:
 
     .. code-block:: json
 
         {
-            "entities": [{"value": "New York City",
-                          "start": 20,
-                          "end": 33,
-                          "entity": "city",
-                          "confidence": null,
-                          "extractor": "SpacyEntityExtractor"}]
+            "entities": [{
+                "value": "New York City",
+                "start": 20,
+                "end": 33,
+                "confidence": null,
+                "entity": "city",
+                "extractor": "SpacyEntityExtractor"
+            }]
         }
 
 :Description:
-    Using spaCy this component predicts the entities of a message. spacy uses a statistical BILOU transition model.
-    As of now, this component can only use the spacy builtin entity extraction models and can not be retrained.
+    Using spaCy this component predicts the entities of a message. spaCy uses a statistical BILOU transition model.
+    As of now, this component can only use the spaCy builtin entity extraction models and can not be retrained.
     This extractor does not provide any confidence scores.
 
 :Configuration:
-    Configure which dimensions, i.e. entity types, the spacy component
+    Configure which dimensions, i.e. entity types, the spaCy component
     should extract. A full list of available dimensions can be found in
     the `spaCy documentation <https://spacy.io/api/annotation#section-named-entities>`_.
     Leaving the dimensions option unspecified will extract all available dimensions.
@@ -1037,7 +899,6 @@ SpacyEntityExtractor
 EntitySynonymMapper
 ~~~~~~~~~~~~~~~~~~~
 
-
 :Short: Maps synonymous entity values to the same value.
 :Outputs: modifies existing entities that previous entity extraction components found
 :Requires: nothing
@@ -1048,46 +909,59 @@ EntitySynonymMapper
 
     .. code-block:: json
 
-        [{
-          "text": "I moved to New York City",
-          "intent": "inform_relocation",
-          "entities": [{"value": "nyc",
-                        "start": 11,
-                        "end": 24,
-                        "entity": "city",
-                       }]
-        },
-        {
-          "text": "I got a new flat in NYC.",
-          "intent": "inform_relocation",
-          "entities": [{"value": "nyc",
-                        "start": 20,
-                        "end": 23,
-                        "entity": "city",
-                       }]
-        }]
-
-    This component will allow you to map the entities ``New York City`` and ``NYC`` to ``nyc``. The entitiy
+        [
+            {
+              "text": "I moved to New York City",
+              "intent": "inform_relocation",
+              "entities": [{
+                "value": "nyc",
+                "start": 11,
+                "end": 24,
+                "entity": "city",
+              }]
+            },
+            {
+              "text": "I got a new flat in NYC.",
+              "intent": "inform_relocation",
+              "entities": [{
+                "value": "nyc",
+                "start": 20,
+                "end": 23,
+                "entity": "city",
+              }]
+            }
+        ]
+
+    This component will allow you to map the entities ``New York City`` and ``NYC`` to ``nyc``. The entity
     extraction will return ``nyc`` even though the message contains ``NYC``. When this component changes an
-    exisiting entity, it appends itself to the processor list of this entity.
+    existing entity, it appends itself to the processor list of this entity.
+
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "EntitySynonymMapper"
 
 CRFEntityExtractor
 ~~~~~~~~~~~~~~~~~~
 
-:Short: conditional random field entity extraction
-:Outputs: appends ``entities``
-:Requires: A tokenizer
+:Short: CRF (conditional random field) entity extraction
+:Outputs: ``entities``
+:Requires: ``tokens`` and ``dense_features`` (optional)
 :Output-Example:
 
     .. code-block:: json
 
         {
-            "entities": [{"value":"New York City",
-                          "start": 20,
-                          "end": 33,
-                          "entity": "city",
-                          "confidence": 0.874,
-                          "extractor": "CRFEntityExtractor"}]
+            "entities": [{
+                "value":"New York City",
+                "start": 20,
+                "end": 33,
+                "entity": "city",
+                "confidence": 0.874,
+                "extractor": "CRFEntityExtractor"
+            }]
         }
 
 :Description:
@@ -1099,6 +973,11 @@ CRFEntityExtractor
     If POS features are used (pos or pos2), spaCy has to be installed. If you want to use
     additional features, such as pre-trained word embeddings, from any provided dense
     featurizer, use ``"text_dense_features"``.
+
+    .. warning::
+        ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See
+        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+
 :Configuration:
    .. code-block:: yaml
 
@@ -1195,3 +1074,130 @@ DucklingHTTPExtractor
           # Timeout for receiving response from http url of the running duckling server
           # if not set the default timeout of duckling http url is set to 3 seconds.
           timeout : 3
+
+
+Combined Entity Extraction and Intent Classification
+----------------------------------------------------
+
+DIETClassifier
+~~~~~~~~~~~~~~
+
+:Short: Dual Intent Entity Transformer used for intent classification and entity extraction
+:Outputs: ``entities``, ``intent`` and ``intent_ranking``
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
+:Output-Example:
+
+    .. code-block:: json
+
+        {
+            "intent": {"name": "greet", "confidence": 0.8343},
+            "intent_ranking": [
+                {
+                    "confidence": 0.385910906220309,
+                    "name": "goodbye"
+                },
+                {
+                    "confidence": 0.28161531595656784,
+                    "name": "restaurant_search"
+                }
+            ],
+            "entities": [{
+                "end": 53,
+                "entity": "time",
+                "start": 48,
+                "value": "2017-04-10T00:00:00.000+02:00",
+                "confidence": 1.0,
+                "extractor": "DIETClassifier"
+            }]
+        }
+
+:Description:
+    TODO
+
+    .. note:: If during prediction time a message contains **only** words unseen during training,
+              and no Out-Of-Vacabulary preprocessor was used,
+              empty intent ``None`` is predicted with confidence ``0.0``.
+
+:Configuration:
+
+    The algorithm also has hyperparameters to control:
+
+        - neural network's architecture:
+
+            - ``hidden_layers_sizes_a`` sets a list of hidden layer sizes before
+              the embedding layer for user inputs, the number of hidden layers
+              is equal to the length of the list
+            - ``hidden_layers_sizes_b`` sets a list of hidden layer sizes before
+              the embedding layer for intent labels, the number of hidden layers
+              is equal to the length of the list
+            - ``share_hidden`` if set to True, shares the hidden layers between user inputs and intent label
+
+        - training:
+
+            - ``batch_size`` sets the number of training examples in one
+              forward/backward pass, the higher the batch size, the more
+              memory space you'll need;
+            - ``batch_strategy`` sets the type of batching strategy,
+              it should be either ``sequence`` or ``balanced``;
+            - ``epochs`` sets the number of times the algorithm will see
+              training data, where one ``epoch`` equals one forward pass and
+              one backward pass of all the training examples;
+            - ``random_seed`` if set to any int will get reproducible
+              training results for the same inputs;
+
+        - embedding:
+
+            - ``embed_dim`` sets the dimension of embedding space;
+            - ``num_neg`` sets the number of incorrect intent labels,
+              the algorithm will minimize their similarity to the user
+              input during training;
+            - ``similarity_type`` sets the type of the similarity,
+              it should be either ``auto``, ``cosine`` or ``inner``,
+              if ``auto``, it will be set depending on ``loss_type``,
+              ``inner`` for ``softmax``, ``cosine`` for ``margin``;
+            - ``loss_type`` sets the type of the loss function,
+              it should be either ``softmax`` or ``margin``;
+            - ``ranking_length`` defines the number of top confidences over
+              which to normalize ranking results if ``loss_type: "softmax"``;
+              to turn off normalization set it to 0
+            - ``mu_pos`` controls how similar the algorithm should try
+              to make embedding vectors for correct intent labels,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``mu_neg`` controls maximum negative similarity for
+              incorrect intents,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``use_max_sim_neg`` if ``true`` the algorithm only
+              minimizes maximum similarity over incorrect intent labels,
+              used only if ``loss_type`` is set to ``margin``;
+            - ``scale_loss`` if ``true`` the algorithm will downscale the loss
+              for examples where correct label is predicted with high confidence,
+              used only if ``loss_type`` is set to ``softmax``;
+
+        - regularization:
+
+            - ``C2`` sets the scale of L2 regularization
+            - ``C_emb`` sets the scale of how important is to minimize
+              the maximum similarity between embeddings of different intent labels;
+            - ``droprate`` sets the dropout rate, it should be
+              between ``0`` and ``1``, e.g. ``droprate=0.1``
+              would drop out ``10%`` of input units;
+
+    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
+
+    .. note:: There is an option to use linearly increasing batch size. The idea comes from
+              `<https://arxiv.org/abs/1711.00489>`_.
+              In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
+              If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
+
+    In the config, you can specify these parameters.
+    The default values are defined in ``DIETClassifier.defaults``:
+
+    .. literalinclude:: ../../rasa/nlu/classifiers/diet_classifier.py
+       :dedent: 4
+       :start-after: # default properties (DOC MARKER - don't remove)
+       :end-before: # end default properties (DOC MARKER - don't remove)
+
+    .. note:: Parameter ``mu_neg`` is set to a negative value to mimic the original
+              starspace algorithm in the case ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``.
+              See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
+

From 554df2778a55be6611aa7ba58a95294bc1bd50d2 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 15:49:09 +0100
Subject: [PATCH 345/633] add rel attn options to defaults

---
 rasa/nlu/classifiers/diet_classifier.py           | 2 +-
 rasa/nlu/selectors/embedding_response_selector.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 8f3449549983..e3d6ec6e73d7 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -60,6 +60,7 @@
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
+    DROPRATE_ATTENTION,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -68,7 +69,6 @@
     MU_POS,
     EMBED_DIM,
     BILOU_FLAG,
-    DROPRATE_ATTENTION,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py
index 291c7b11b7c1..165322fa888f 100644
--- a/rasa/nlu/selectors/embedding_response_selector.py
+++ b/rasa/nlu/selectors/embedding_response_selector.py
@@ -39,6 +39,9 @@
     MU_POS,
     EMBED_DIM,
     BILOU_FLAG,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
 )
 from rasa.nlu.constants import (
     RESPONSE_ATTRIBUTE,
@@ -163,6 +166,12 @@ class ResponseSelector(DIETClassifier):
         "retrieval_intent": None,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
     }
     # end default properties (DOC MARKER - don't remove)
 

From 5def753b3046ab0f7927e36c94255b1a86b74d55 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 16:51:19 +0100
Subject: [PATCH 346/633] TEMPORARY commit to skip tests that take too long

---
 tests/cli/test_rasa_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index 50d21c6e6978..e8b30bc27d54 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -4,7 +4,7 @@
 from rasa.utils.io import list_files, write_yaml_file
 from typing import Callable
 from _pytest.pytester import RunResult
-
+import pytest
 
 def test_test_core(run_in_default_project: Callable[..., RunResult]):
     run_in_default_project("test", "core", "--stories", "data")
@@ -58,6 +58,7 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
     assert os.path.exists("results/confmat.png")
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     copyfile("config.yml", "nlu-config.yml")
 
@@ -69,6 +70,7 @@ def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists("results/run_2")
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     files = list_files("models")
     copyfile(files[0], "models/copy-model.tar.gz")
@@ -86,6 +88,7 @@ def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison_after_train(
     run_in_default_project: Callable[..., RunResult]
 ):

From 2cf37175c87ea312f37f20bf1ea05158c4d87fc2 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 16:54:53 +0100
Subject: [PATCH 347/633] PEP

---
 tests/cli/test_rasa_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index e8b30bc27d54..f1515b78c30d 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -6,6 +6,7 @@
 from _pytest.pytester import RunResult
 import pytest
 
+
 def test_test_core(run_in_default_project: Callable[..., RunResult]):
     run_in_default_project("test", "core", "--stories", "data")
 
@@ -88,7 +89,6 @@ def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison_after_train(
     run_in_default_project: Callable[..., RunResult]
 ):

From 26c25901def05a10de77a87139297455237a7e5e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 16:58:37 +0100
Subject: [PATCH 348/633] remove TEMP commit

---
 tests/cli/test_rasa_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index f1515b78c30d..50d21c6e6978 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -4,7 +4,6 @@
 from rasa.utils.io import list_files, write_yaml_file
 from typing import Callable
 from _pytest.pytester import RunResult
-import pytest
 
 
 def test_test_core(run_in_default_project: Callable[..., RunResult]):
@@ -59,7 +58,6 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
     assert os.path.exists("results/confmat.png")
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     copyfile("config.yml", "nlu-config.yml")
 
@@ -71,7 +69,6 @@ def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists("results/run_2")
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     files = list_files("models")
     copyfile(files[0], "models/copy-model.tar.gz")

From 147124f737079c5cb65a2871c29583f7af0a119f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 17:14:12 +0100
Subject: [PATCH 349/633] TEMPORARY commit to skip tests that take too long

---
 tests/cli/test_rasa_test.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index 50d21c6e6978..2051210683e9 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -4,6 +4,7 @@
 from rasa.utils.io import list_files, write_yaml_file
 from typing import Callable
 from _pytest.pytester import RunResult
+import pytest
 
 
 def test_test_core(run_in_default_project: Callable[..., RunResult]):
@@ -58,6 +59,7 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
     assert os.path.exists("results/confmat.png")
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     copyfile("config.yml", "nlu-config.yml")
 
@@ -69,6 +71,7 @@ def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists("results/run_2")
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     files = list_files("models")
     copyfile(files[0], "models/copy-model.tar.gz")
@@ -86,6 +89,7 @@ def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
 
 
+@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison_after_train(
     run_in_default_project: Callable[..., RunResult]
 ):

From 719b3f4d38eea7fbaddf39f86889ace319ac7a2c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 17:14:38 +0100
Subject: [PATCH 350/633] remove TEMP commit

---
 tests/cli/test_rasa_test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index 2051210683e9..50d21c6e6978 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -4,7 +4,6 @@
 from rasa.utils.io import list_files, write_yaml_file
 from typing import Callable
 from _pytest.pytester import RunResult
-import pytest
 
 
 def test_test_core(run_in_default_project: Callable[..., RunResult]):
@@ -59,7 +58,6 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
     assert os.path.exists("results/confmat.png")
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     copyfile("config.yml", "nlu-config.yml")
 
@@ -71,7 +69,6 @@ def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists("results/run_2")
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     files = list_files("models")
     copyfile(files[0], "models/copy-model.tar.gz")
@@ -89,7 +86,6 @@ def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
 
 
-@pytest.mark.skip(reason="Takes forever.")  # TODO remove
 def test_test_core_comparison_after_train(
     run_in_default_project: Callable[..., RunResult]
 ):

From 193b39e84914013016766234d42d6976be22ab93 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 18:02:07 +0100
Subject: [PATCH 351/633] fix random seed

---
 rasa/nlu/classifiers/diet_classifier.py |  3 ++-
 rasa/utils/tensorflow/layers.py         | 17 ++++++++---------
 tests/nlu/training/test_train.py        |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index e3d6ec6e73d7..0cb6d17ea41c 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1226,8 +1226,9 @@ def _entity_loss(
 
         logits = self._tf_layers["embed.logits"](a)
 
-        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
+        # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
+        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index e81c16b1cc40..6b776cf04563 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -155,14 +155,10 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
 
 
 class InputMask(tf.keras.layers.Layer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.mask_initializer = initializers.get("glorot_uniform")
 
     def build(self, input_shape: tf.TensorShape) -> None:
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]),
-            initializer=self.mask_initializer,
             name="mask_vector",
         )
         self.built = True
@@ -216,14 +212,17 @@ def x_masked():
 class CRF(tf.keras.layers.Layer):
     def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
-        initializer = initializers.get("glorot_uniform")
-        regularizer = tf.keras.regularizers.l1(reg_lambda)
+        self.num_tags = num_tags
+        self.regularizer = tf.keras.regularizers.l1(reg_lambda)
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        # should be created in `build` to apply random_seed
         self.transition_params = self.add_weight(
-            shape=(num_tags, num_tags),
-            initializer=initializer,
-            regularizer=regularizer,
+            shape=(self.num_tags, self.num_tags),
+            regularizer=self.regularizer,
             name="transitions",
         )
+        self.built = True
 
     def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
         pred_ids, _ = tfa.text.crf.crf_decode(
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index a6f3e9b46fd6..0b2d416927db 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -98,7 +98,7 @@ async def test_random_seed(component_builder, tmpdir):
     """test if train result is the same for two runs of tf embedding"""
 
     _config = utilities.base_test_conf("supervised_embeddings")
-    # set fixed random seed of the embedding intent classifier to 1
+    # set fixed random seed of the DIET classifier to 1
     _config.set_component_attr(5, random_seed=1)
 
     # first run

From 3a0de01337b8f9d2d4e9a8e1fa69e67a7c3bea5c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 22:13:23 +0100
Subject: [PATCH 352/633] TEMPORARY delete cli tests

---
 tests/cli/__init__.py              |   0
 tests/cli/conftest.py              |  34 ---
 tests/cli/test_cli.py              |  37 ---
 tests/cli/test_rasa_data.py        |  78 -----
 tests/cli/test_rasa_init.py        |  53 ----
 tests/cli/test_rasa_interactive.py | 158 ----------
 tests/cli/test_rasa_run.py         |  46 ---
 tests/cli/test_rasa_shell.py       |  34 ---
 tests/cli/test_rasa_test.py        | 194 ------------
 tests/cli/test_rasa_train.py       | 464 -----------------------------
 tests/cli/test_rasa_visualize.py   |  15 -
 tests/cli/test_rasa_x.py           | 152 ----------
 tests/cli/test_utils.py            | 105 -------
 13 files changed, 1370 deletions(-)
 delete mode 100644 tests/cli/__init__.py
 delete mode 100644 tests/cli/conftest.py
 delete mode 100644 tests/cli/test_cli.py
 delete mode 100644 tests/cli/test_rasa_data.py
 delete mode 100644 tests/cli/test_rasa_init.py
 delete mode 100644 tests/cli/test_rasa_interactive.py
 delete mode 100644 tests/cli/test_rasa_run.py
 delete mode 100644 tests/cli/test_rasa_shell.py
 delete mode 100644 tests/cli/test_rasa_test.py
 delete mode 100644 tests/cli/test_rasa_train.py
 delete mode 100644 tests/cli/test_rasa_visualize.py
 delete mode 100644 tests/cli/test_rasa_x.py
 delete mode 100644 tests/cli/test_utils.py

diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
deleted file mode 100644
index b7294eaeadff..000000000000
--- a/tests/cli/conftest.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Callable
-import pytest
-import os
-from _pytest.pytester import Testdir, RunResult
-
-
-@pytest.fixture
-def run(testdir: Testdir) -> Callable[..., RunResult]:
-    def do_run(*args):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args)
-
-    return do_run
-
-
-@pytest.fixture
-def run_with_stdin(testdir: Testdir) -> Callable[..., RunResult]:
-    def do_run(*args, stdin):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args, stdin=stdin)
-
-    return do_run
-
-
-@pytest.fixture
-def run_in_default_project(testdir: Testdir) -> Callable[..., RunResult]:
-    os.environ["LOG_LEVEL"] = "ERROR"
-    testdir.run("rasa", "init", "--no-prompt")
-
-    def do_run(*args):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args)
-
-    return do_run
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
deleted file mode 100644
index dc221349b647..000000000000
--- a/tests/cli/test_cli.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pytest
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_cli_start(run: Callable[..., RunResult]):
-    """
-    Measures an average startup time and checks that it
-    does not deviate more than x seconds from 5.
-    """
-    import time
-
-    durations = []
-
-    for i in range(5):
-        start = time.time()
-        run("--help")
-        end = time.time()
-
-        durations.append(end - start)
-
-    avg_duration = sum(durations) / len(durations)
-
-    # When run in parallel, it takes a little longer
-    assert avg_duration - 5 <= 2
-
-
-def test_data_convert_help(run: Callable[..., RunResult]):
-    output = run("--help")
-
-    help_text = """usage: rasa [-h] [--version]
-            {init,run,shell,train,interactive,test,visualize,data,x} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py
deleted file mode 100644
index 3021e9ab12e7..000000000000
--- a/tests/cli/test_rasa_data.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import os
-import pytest
-from collections import namedtuple
-from typing import Callable
-from _pytest.pytester import RunResult
-from rasa.cli import data
-
-
-def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75"
-    )
-
-    assert os.path.exists("train_test_split")
-    assert os.path.exists(os.path.join("train_test_split", "test_data.md"))
-    assert os.path.exists(os.path.join("train_test_split", "training_data.md"))
-
-
-def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "data",
-        "convert",
-        "nlu",
-        "--data",
-        "data/nlu.md",
-        "--out",
-        "out_nlu_data.json",
-        "-f",
-        "json",
-    )
-
-    assert os.path.exists("out_nlu_data.json")
-
-
-def test_data_split_help(run: Callable[..., RunResult]):
-    output = run("data", "split", "nlu", "--help")
-
-    help_text = """usage: rasa data split nlu [-h] [-v] [-vv] [--quiet] [-u NLU]
-                           [--training-fraction TRAINING_FRACTION]
-                           [--random-seed RANDOM_SEED] [--out OUT]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_data_convert_help(run: Callable[..., RunResult]):
-    output = run("data", "convert", "nlu", "--help")
-
-    help_text = """usage: rasa data convert nlu [-h] [-v] [-vv] [--quiet] --data DATA --out OUT
-                             [-l LANGUAGE] -f {json,md}"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_data_validate_help(run: Callable[..., RunResult]):
-    output = run("data", "validate", "--help")
-
-    help_text = """usage: rasa data validate [-h] [-v] [-vv] [--quiet] [--fail-on-warnings]
-                          [-d DOMAIN] [--data DATA]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_validate_files_exit_early():
-    with pytest.raises(SystemExit) as pytest_e:
-        args = {"domain": "data/test_domains/duplicate_intents.yml", "data": None}
-        data.validate_files(namedtuple("Args", args.keys())(*args.values()))
-
-    assert pytest_e.type == SystemExit
-    assert pytest_e.value.code == 1
diff --git a/tests/cli/test_rasa_init.py b/tests/cli/test_rasa_init.py
deleted file mode 100644
index c23ba563dda5..000000000000
--- a/tests/cli/test_rasa_init.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_init(run: Callable[..., RunResult]):
-    run("init", "--no-prompt", "--quiet")
-
-    assert os.path.exists("actions.py")
-    assert os.path.exists("domain.yml")
-    assert os.path.exists("config.yml")
-    assert os.path.exists("credentials.yml")
-    assert os.path.exists("endpoints.yml")
-    assert os.path.exists("models")
-    assert os.path.exists("data/nlu.md")
-    assert os.path.exists("data/stories.md")
-
-
-def test_init_using_init_dir_option(run: Callable[..., RunResult]):
-    os.makedirs("./workspace")
-    run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
-
-    assert os.path.exists("./workspace/actions.py")
-    assert os.path.exists("./workspace/domain.yml")
-    assert os.path.exists("./workspace/config.yml")
-    assert os.path.exists("./workspace/credentials.yml")
-    assert os.path.exists("./workspace/endpoints.yml")
-    assert os.path.exists("./workspace/models")
-    assert os.path.exists("./workspace/data/nlu.md")
-    assert os.path.exists("./workspace/data/stories.md")
-
-
-def test_not_fount_init_path(run: Callable[..., RunResult]):
-    output = run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
-
-    assert (
-        output.outlines[-1]
-        == "\033[91mProject init path './workspace' not found.\033[0m"
-    )
-
-
-def test_init_help(run: Callable[..., RunResult]):
-    output = run("init", "--help")
-
-    assert (
-        output.outlines[0]
-        == "usage: rasa init [-h] [-v] [-vv] [--quiet] [--no-prompt] [--init-dir INIT_DIR]"
-    )
-
-
-def test_user_asked_to_train_model(run_with_stdin: Callable[..., RunResult]):
-    run_with_stdin("init", stdin=b"\nYN")
-    assert not os.path.exists("models")
diff --git a/tests/cli/test_rasa_interactive.py b/tests/cli/test_rasa_interactive.py
deleted file mode 100644
index eabc315d1089..000000000000
--- a/tests/cli/test_rasa_interactive.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import argparse
-import pytest
-from typing import Callable, Text
-from unittest.mock import Mock
-
-from _pytest.monkeypatch import MonkeyPatch
-from _pytest.pytester import RunResult
-
-import rasa
-from rasa.cli import interactive, train
-
-
-def test_interactive_help(run: Callable[..., RunResult]):
-    output = run("interactive", "--help")
-
-    help_text = """usage: rasa interactive [-h] [-v] [-vv] [--quiet] [--e2e] [-m MODEL]
-                        [--data DATA [DATA ...]] [--skip-visualization]
-                        [--endpoints ENDPOINTS] [-c CONFIG] [-d DOMAIN]
-                        [--out OUT] [--augmentation AUGMENTATION]
-                        [--debug-plots] [--dump-stories] [--force]
-                        [--persist-nlu-data]
-                        {core} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_interactive_core_help(run: Callable[..., RunResult]):
-    output = run("interactive", "core", "--help")
-
-    help_text = """usage: rasa interactive core [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
-                             [--skip-visualization] [--endpoints ENDPOINTS]
-                             [-c CONFIG] [-d DOMAIN] [--out OUT]
-                             [--augmentation AUGMENTATION] [--debug-plots]
-                             [--dump-stories]
-                             [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_pass_arguments_to_rasa_train(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    # Create parser
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    # Parse interactive command
-    args = parser.parse_args(["interactive", "--config", default_stack_config])
-    interactive._set_not_required_args(args)
-
-    # Mock actual training
-    mock = Mock()
-    monkeypatch.setattr(rasa, "train", mock.method)
-
-    # If the `Namespace` object does not have all required fields this will throw
-    train.train(args)
-
-    # Assert `train` was actually called
-    mock.method.assert_called_once()
-
-
-def test_train_called_when_no_model_passed(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "--config",
-            default_stack_config,
-            "--data",
-            "examples/moodbot/data",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    # Mock actual training and interactive learning methods
-    mock = Mock()
-    monkeypatch.setattr(train, "train", mock.train_model)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    interactive.interactive(args)
-    mock.train_model.assert_called_once()
-
-
-def test_train_core_called_when_no_model_passed_and_core(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "core",
-            "--config",
-            default_stack_config,
-            "--stories",
-            "examples/moodbot/data/stories.md",
-            "--domain",
-            "examples/moodbot/domain.yml",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    # Mock actual training and interactive learning methods
-    mock = Mock()
-    monkeypatch.setattr(train, "train_core", mock.train_core)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    interactive.interactive(args)
-    mock.train_core.assert_called_once()
-
-
-def test_no_interactive_without_core_data(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "--config",
-            default_stack_config,
-            "--data",
-            "examples/moodbot/data/nlu.md",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    mock = Mock()
-    monkeypatch.setattr(train, "train", mock.train_model)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    with pytest.raises(SystemExit):
-        interactive.interactive(args)
-
-    mock.train_model.assert_not_called()
-    mock.perform_interactive_learning.assert_not_called()
diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py
deleted file mode 100644
index 8de8685e91dd..000000000000
--- a/tests/cli/test_rasa_run.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import shutil
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]):
-    os.remove("domain.yml")
-    shutil.rmtree("models")
-
-    # the server should not start as no model is configured
-    output = run_in_default_project("run")
-
-    assert "No model found." in output.outlines[0]
-
-
-def test_run_help(run: Callable[..., RunResult]):
-    output = run("run", "--help")
-
-    help_text = """usage: rasa run [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
-                [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-                [--cors [CORS [CORS ...]]] [--enable-api]
-                [--remote-storage REMOTE_STORAGE]
-                [--ssl-certificate SSL_CERTIFICATE]
-                [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
-                [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
-                [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
-                [--jwt-method JWT_METHOD]
-                {actions} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_run_action_help(run: Callable[..., RunResult]):
-    output = run("run", "actions", "--help")
-
-    help_text = """usage: rasa run actions [-h] [-v] [-vv] [--quiet] [-p PORT]
-                        [--cors [CORS [CORS ...]]] [--actions ACTIONS]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_shell.py b/tests/cli/test_rasa_shell.py
deleted file mode 100644
index 7301db203ec0..000000000000
--- a/tests/cli/test_rasa_shell.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_shell_help(run: Callable[..., RunResult]):
-    output = run("shell", "--help")
-
-    help_text = """usage: rasa shell [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
-                  [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-                  [--cors [CORS [CORS ...]]] [--enable-api]
-                  [--remote-storage REMOTE_STORAGE]
-                  [--ssl-certificate SSL_CERTIFICATE]
-                  [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
-                  [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
-                  [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
-                  [--jwt-method JWT_METHOD]
-                  {nlu} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_shell_nlu_help(run: Callable[..., RunResult]):
-    output = run("shell", "nlu", "--help")
-
-    help_text = """usage: rasa shell nlu [-h] [-v] [-vv] [--quiet] [-m MODEL]
-                      [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
deleted file mode 100644
index 50d21c6e6978..000000000000
--- a/tests/cli/test_rasa_test.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import os
-from shutil import copyfile
-from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE
-from rasa.utils.io import list_files, write_yaml_file
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_test_core(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "core", "--stories", "data")
-
-    assert os.path.exists("results")
-
-
-def test_test_core_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "core", "--no-plot")
-
-    assert not os.path.exists("results/story_confmat.pdf")
-
-
-def test_test(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test")
-
-    assert os.path.exists("results")
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-
-
-def test_test_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "--no-plot")
-
-    assert not os.path.exists("results/hist.png")
-    assert not os.path.exists("results/confmat.png")
-    assert not os.path.exists("results/story_confmat.pdf")
-
-
-def test_test_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "nlu", "--nlu", "data", "--successes")
-
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-    assert os.path.exists("results/intent_successes.json")
-
-
-def test_test_nlu_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "nlu", "--no-plot")
-
-    assert not os.path.exists("results/confmat.png")
-    assert not os.path.exists("results/hist.png")
-
-
-def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "test", "nlu", "--cross-validation", "-c", "config.yml", "-f", "2"
-    )
-
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-
-
-def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
-    copyfile("config.yml", "nlu-config.yml")
-
-    run_in_default_project(
-        "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2"
-    )
-
-    assert os.path.exists("results/run_1")
-    assert os.path.exists("results/run_2")
-
-
-def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
-    files = list_files("models")
-    copyfile(files[0], "models/copy-model.tar.gz")
-
-    run_in_default_project(
-        "test",
-        "core",
-        "-m",
-        files[0],
-        "models/copy-model.tar.gz",
-        "--stories",
-        "data/stories.md",
-    )
-
-    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
-
-
-def test_test_core_comparison_after_train(
-    run_in_default_project: Callable[..., RunResult]
-):
-    write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "KerasPolicy"}],
-        },
-        "config_1.yml",
-    )
-
-    write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "MemoizationPolicy"}],
-        },
-        "config_2.yml",
-    )
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config_1.yml",
-        "config_2.yml",
-        "--stories",
-        "data/stories.md",
-        "--runs",
-        "2",
-        "--percentages",
-        "25",
-        "75",
-        "--augmentation",
-        "5",
-        "--out",
-        "comparison_models",
-    )
-
-    assert os.path.exists("comparison_models")
-    assert os.path.exists("comparison_models/run_1")
-    assert os.path.exists("comparison_models/run_2")
-
-    run_in_default_project(
-        "test",
-        "core",
-        "-m",
-        "comparison_models",
-        "--stories",
-        "data/stories",
-        "--evaluate-model-directory",
-    )
-
-    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
-    assert os.path.exists(
-        os.path.join(DEFAULT_RESULTS_PATH, "core_model_comparison_graph.pdf")
-    )
-
-
-def test_test_help(run: Callable[..., RunResult]):
-    output = run("test", "--help")
-
-    help_text = """usage: rasa test [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
-                 [--max-stories MAX_STORIES] [--e2e] [--endpoints ENDPOINTS]
-                 [--fail-on-prediction-errors] [--url URL]
-                 [--evaluate-model-directory] [-u NLU] [--out OUT]
-                 [--successes] [--no-errors] [--histogram HISTOGRAM]
-                 [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
-                 [--cross-validation] [-f FOLDS] [-r RUNS]
-                 [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]
-                 {core,nlu} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_test_nlu_help(run: Callable[..., RunResult]):
-    output = run("test", "nlu", "--help")
-
-    help_text = """usage: rasa test nlu [-h] [-v] [-vv] [--quiet] [-m MODEL] [-u NLU] [--out OUT]
-                     [--successes] [--no-errors] [--histogram HISTOGRAM]
-                     [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
-                     [--cross-validation] [-f FOLDS] [-r RUNS]
-                     [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_test_core_help(run: Callable[..., RunResult]):
-    output = run("test", "core", "--help")
-
-    help_text = """usage: rasa test core [-h] [-v] [-vv] [--quiet] [-m MODEL [MODEL ...]]
-                      [-s STORIES] [--max-stories MAX_STORIES] [--out OUT]
-                      [--e2e] [--endpoints ENDPOINTS]
-                      [--fail-on-prediction-errors] [--url URL]
-                      [--evaluate-model-directory] [--no-plot]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py
deleted file mode 100644
index 1839c77b3ab9..000000000000
--- a/tests/cli/test_rasa_train.py
+++ /dev/null
@@ -1,464 +0,0 @@
-import os
-import shutil
-import tempfile
-
-import pytest
-from typing import Callable
-from _pytest.pytester import RunResult
-
-from rasa import model
-from rasa.nlu.model import Metadata
-from rasa.nlu.training_data import training_data
-from rasa.cli.train import _get_valid_config
-from rasa.constants import (
-    CONFIG_MANDATORY_KEYS_CORE,
-    CONFIG_MANDATORY_KEYS,
-    CONFIG_MANDATORY_KEYS_NLU,
-)
-import rasa.utils.io as io_utils
-
-
-def test_train(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models",
-        "--fixed-model-name",
-        "test-model",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "train_models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is None
-    assert not os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models",
-        "--fixed-model-name",
-        "test-model",
-        "--persist-nlu-data",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "train_models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is not None
-    assert os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    io_utils.write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "KerasPolicy"}],
-        },
-        "config_1.yml",
-    )
-
-    io_utils.write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "MemoizationPolicy"}],
-        },
-        "config_2.yml",
-    )
-
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config_1.yml",
-        "config_2.yml",
-        "--stories",
-        "data/stories.md",
-        "--out",
-        "core_comparison_results",
-        "--runs",
-        "2",
-        "--percentages",
-        "25",
-        "75",
-        "--augmentation",
-        "5",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "core_comparison_results"))
-    run_directories = io_utils.list_subdirectories(
-        os.path.join(temp_dir, "core_comparison_results")
-    )
-    assert len(run_directories) == 2
-    model_files = io_utils.list_files(
-        os.path.join(temp_dir, "core_comparison_results", run_directories[0])
-    )
-    assert len(model_files) == 4
-    assert model_files[0].endswith("tar.gz")
-
-
-def test_train_no_domain_exists(
-    run_in_default_project: Callable[..., RunResult]
-) -> None:
-
-    os.remove("domain.yml")
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models_no_domain",
-        "--fixed-model-name",
-        "nlu-model-only",
-    )
-
-    assert os.path.exists("train_models_no_domain")
-    files = io_utils.list_files("train_models_no_domain")
-    assert len(files) == 1
-
-    trained_model_path = "train_models_no_domain/nlu-model-only.tar.gz"
-    unpacked = model.unpack_model(trained_model_path)
-
-    metadata_path = os.path.join(unpacked, "nlu", "metadata.json")
-    assert os.path.exists(metadata_path)
-
-
-def test_train_skip_on_model_not_changed(
-    run_in_default_project: Callable[..., RunResult]
-):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-
-    file_name = files[0]
-    run_in_default_project("train")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert file_name == files[0]
-
-
-def test_train_force(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-
-    run_in_default_project("train", "--force")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 2
-
-
-def test_train_with_only_nlu_data(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "data/stories.md"))
-    os.remove(os.path.join(temp_dir, "data/stories.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
-
-    run_in_default_project("train", "--fixed-model-name", "test-model")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-
-
-def test_train_with_only_core_data(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "data/nlu.md"))
-    os.remove(os.path.join(temp_dir, "data/nlu.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
-
-    run_in_default_project("train", "--fixed-model-name", "test-model")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-
-
-def test_train_core(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--stories",
-        "data",
-        "--out",
-        "train_rasa_models",
-        "--fixed-model-name",
-        "rasa-model",
-    )
-
-    assert os.path.exists("train_rasa_models/rasa-model.tar.gz")
-    assert os.path.isfile("train_rasa_models/rasa-model.tar.gz")
-
-
-def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]):
-
-    os.remove("domain.yml")
-    run_in_default_project(
-        "train",
-        "core",
-        "--config",
-        "config.yml",
-        "--domain",
-        "domain1.yml",
-        "--stories",
-        "data",
-        "--out",
-        "train_rasa_models_no_domain",
-        "--fixed-model-name",
-        "rasa-model",
-    )
-
-    assert not os.path.exists("train_rasa_models_no_domain/rasa-model.tar.gz")
-    assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz")
-
-
-def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "train",
-        "nlu",
-        "-c",
-        "config.yml",
-        "--nlu",
-        "data/nlu.md",
-        "--out",
-        "train_models",
-    )
-
-    assert os.path.exists("train_models")
-    files = io_utils.list_files("train_models")
-    assert len(files) == 1
-    assert os.path.basename(files[0]).startswith("nlu-")
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is None
-    assert not os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_nlu_persist_nlu_data(
-    run_in_default_project: Callable[..., RunResult]
-) -> None:
-    run_in_default_project(
-        "train",
-        "nlu",
-        "-c",
-        "config.yml",
-        "--nlu",
-        "data/nlu.md",
-        "--out",
-        "train_models",
-        "--persist-nlu-data",
-    )
-
-    assert os.path.exists("train_models")
-    files = io_utils.list_files("train_models")
-    assert len(files) == 1
-    assert os.path.basename(files[0]).startswith("nlu-")
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is not None
-    assert os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_help(run):
-    output = run("train", "--help")
-
-    help_text = """usage: rasa train [-h] [-v] [-vv] [--quiet] [--data DATA [DATA ...]]
-                  [-c CONFIG] [-d DOMAIN] [--out OUT]
-                  [--augmentation AUGMENTATION] [--debug-plots]
-                  [--dump-stories] [--fixed-model-name FIXED_MODEL_NAME]
-                  [--persist-nlu-data] [--force]
-                  {core,nlu} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_train_nlu_help(run: Callable[..., RunResult]):
-    output = run("train", "nlu", "--help")
-
-    help_text = """usage: rasa train nlu [-h] [-v] [-vv] [--quiet] [-c CONFIG] [--out OUT]
-                      [-u NLU] [--fixed-model-name FIXED_MODEL_NAME]
-                      [--persist-nlu-data]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_train_core_help(run: Callable[..., RunResult]):
-    output = run("train", "core", "--help")
-
-    help_text = """usage: rasa train core [-h] [-v] [-vv] [--quiet] [-s STORIES] [-d DOMAIN]
-                       [-c CONFIG [CONFIG ...]] [--out OUT]
-                       [--augmentation AUGMENTATION] [--debug-plots]
-                       [--dump-stories] [--force]
-                       [--fixed-model-name FIXED_MODEL_NAME]
-                       [--percentages [PERCENTAGES [PERCENTAGES ...]]]
-                       [--runs RUNS]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-@pytest.mark.parametrize(
-    "parameters",
-    [
-        {
-            "config_data": {"language": "en", "pipeline": "supervised"},
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
-            "error": True,
-        },
-        {
-            "config_data": {},
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": True,
-        },
-        {
-            "config_data": {
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-                "imports": "other-folder",
-            },
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": {
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": False,
-        },
-        {
-            "config_data": None,
-            "default_config": {"language": "en", "pipeline": "supervised"},
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": None,
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": True,
-        },
-    ],
-)
-def test_get_valid_config(parameters):
-    import rasa.utils.io
-
-    config_path = None
-    if parameters["config_data"] is not None:
-        config_path = os.path.join(tempfile.mkdtemp(), "config.yml")
-        rasa.utils.io.write_yaml_file(parameters["config_data"], config_path)
-
-    default_config_path = None
-    if parameters["default_config"] is not None:
-        default_config_path = os.path.join(tempfile.mkdtemp(), "default-config.yml")
-        rasa.utils.io.write_yaml_file(parameters["default_config"], default_config_path)
-
-    if parameters["error"]:
-        with pytest.raises(SystemExit):
-            _get_valid_config(config_path, parameters["mandatory_keys"])
-
-    else:
-        config_path = _get_valid_config(
-            config_path, parameters["mandatory_keys"], default_config_path
-        )
-
-        config_data = rasa.utils.io.read_yaml_file(config_path)
-
-        for k in parameters["mandatory_keys"]:
-            assert k in config_data
-
-
-def test_get_valid_config_with_non_existing_file():
-    with pytest.raises(SystemExit):
-        _get_valid_config("non-existing-file.yml", CONFIG_MANDATORY_KEYS)
diff --git a/tests/cli/test_rasa_visualize.py b/tests/cli/test_rasa_visualize.py
deleted file mode 100644
index f69115c9529b..000000000000
--- a/tests/cli/test_rasa_visualize.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_visualize_help(run: Callable[..., RunResult]):
-    output = run("visualize", "--help")
-
-    help_text = """usage: rasa visualize [-h] [-v] [-vv] [--quiet] [-d DOMAIN] [-s STORIES]
-                      [-c CONFIG] [--out OUT] [--max-history MAX_HISTORY]
-                      [-u NLU]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_x.py b/tests/cli/test_rasa_x.py
deleted file mode 100644
index 656ec082b8a4..000000000000
--- a/tests/cli/test_rasa_x.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from pathlib import Path
-
-import pytest
-from typing import Callable, Dict
-from _pytest.pytester import RunResult
-
-
-from aioresponses import aioresponses
-
-import rasa.utils.io as io_utils
-from rasa.cli import x
-from rasa.utils.endpoints import EndpointConfig
-from rasa.core.utils import AvailableEndpoints
-
-
-def test_x_help(run: Callable[..., RunResult]):
-    output = run("x", "--help")
-
-    help_text = """usage: rasa x [-h] [-v] [-vv] [--quiet] [-m MODEL] [--data DATA] [-c CONFIG]
-              [--no-prompt] [--production] [--rasa-x-port RASA_X_PORT]
-              [--config-endpoint CONFIG_ENDPOINT] [--log-file LOG_FILE]
-              [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-              [--cors [CORS [CORS ...]]] [--enable-api]
-              [--remote-storage REMOTE_STORAGE]
-              [--ssl-certificate SSL_CERTIFICATE] [--ssl-keyfile SSL_KEYFILE]
-              [--ssl-ca-file SSL_CA_FILE] [--ssl-password SSL_PASSWORD]
-              [--credentials CREDENTIALS] [--connector CONNECTOR]
-              [--jwt-secret JWT_SECRET] [--jwt-method JWT_METHOD]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_prepare_credentials_for_rasa_x_if_rasa_channel_not_given(tmpdir: Path):
-    credentials_path = str(tmpdir / "credentials.yml")
-
-    io_utils.write_yaml_file({}, credentials_path)
-
-    tmp_credentials = x._prepare_credentials_for_rasa_x(
-        credentials_path, "http://localhost:5002"
-    )
-
-    actual = io_utils.read_config_file(tmp_credentials)
-
-    assert actual["rasa"]["url"] == "http://localhost:5002"
-
-
-def test_prepare_credentials_if_already_valid(tmpdir: Path):
-    credentials_path = str(tmpdir / "credentials.yml")
-
-    credentials = {
-        "rasa": {"url": "my-custom-url"},
-        "another-channel": {"url": "some-url"},
-    }
-    io_utils.write_yaml_file(credentials, credentials_path)
-
-    x._prepare_credentials_for_rasa_x(credentials_path)
-
-    actual = io_utils.read_config_file(credentials_path)
-
-    assert actual == credentials
-
-
-def test_if_default_endpoint_config_is_valid_in_local_mode():
-    event_broker_endpoint = x._get_event_broker_endpoint(None)
-
-    assert x._is_correct_event_broker(event_broker_endpoint)
-
-
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {"type": "mongo", "url": "mongodb://localhost:27017"},
-        {"type": "sql", "dialect": "postgresql"},
-        {"type": "sql", "dialect": "sqlite", "db": "some.db"},
-    ],
-)
-def test_if_endpoint_config_is_invalid_in_local_mode(kwargs: Dict):
-    config = EndpointConfig(**kwargs)
-    assert not x._is_correct_event_broker(config)
-
-
-def test_overwrite_model_server_url():
-    endpoint_config = EndpointConfig(url="http://testserver:5002/models/default@latest")
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-def test_overwrite_model_server_url_with_no_model_endpoint():
-    endpoints = AvailableEndpoints()
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-def test_reuse_wait_time_between_pulls():
-    test_wait_time = 5
-    endpoint_config = EndpointConfig(
-        url="http://localhost:5002/models/default@latest",
-        wait_time_between_pulls=test_wait_time,
-    )
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    assert endpoints.model.kwargs["wait_time_between_pulls"] == test_wait_time
-
-
-def test_default_wait_time_between_pulls():
-    endpoint_config = EndpointConfig(url="http://localhost:5002/models/default@latest")
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert endpoints.model.kwargs["wait_time_between_pulls"] == 2
-
-
-def test_default_model_server_url():
-    endpoint_config = EndpointConfig()
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-async def test_pull_runtime_config_from_server():
-    config_url = "http://example.com/api/config?token=token"
-    credentials = "rasa: http://example.com:5002/api"
-    endpoint_config = """
-    event_broker:
-        url: http://example.com/event_broker
-        username: some_username
-        password: PASSWORD
-        queue: broker_queue
-    """
-    with aioresponses() as mocked:
-        mocked.get(
-            config_url,
-            payload={"credentials": credentials, "endpoints": endpoint_config},
-        )
-
-        endpoints_path, credentials_path = await x._pull_runtime_config_from_server(
-            config_url, 1, 0
-        )
-
-        assert io_utils.read_file(endpoints_path) == endpoint_config
-        assert io_utils.read_file(credentials_path) == credentials
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
deleted file mode 100644
index 4d270be3964b..000000000000
--- a/tests/cli/test_utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import contextlib
-import logging
-import os
-import pathlib
-import sys
-import tempfile
-
-import pytest
-from _pytest.logging import LogCaptureFixture
-
-import rasa.cli.utils
-from rasa.cli.utils import (
-    parse_last_positional_argument_as_model_path,
-    get_validated_path,
-)
-
-
-@contextlib.contextmanager
-def make_actions_subdir():
-    """Create a subdir called actions to test model argument handling."""
-    with tempfile.TemporaryDirectory() as tempdir:
-        cwd = os.getcwd()
-        os.chdir(tempdir)
-        try:
-            (pathlib.Path(tempdir) / "actions").mkdir()
-            yield
-        finally:
-            os.chdir(cwd)
-
-
-@pytest.mark.parametrize(
-    "argv",
-    [
-        ["rasa", "run"],
-        ["rasa", "run", "actions"],
-        ["rasa", "run", "core"],
-        ["rasa", "interactive", "nlu", "--param", "xy"],
-    ],
-)
-def test_parse_last_positional_argument_as_model_path(argv):
-    with make_actions_subdir():
-        test_model_dir = tempfile.gettempdir()
-        argv.append(test_model_dir)
-
-        sys.argv = argv.copy()
-        parse_last_positional_argument_as_model_path()
-
-        assert sys.argv[-2] == "--model"
-        assert sys.argv[-1] == test_model_dir
-
-
-@pytest.mark.parametrize(
-    "argv",
-    [
-        ["rasa", "run"],
-        ["rasa", "run", "actions"],
-        ["rasa", "run", "core"],
-        ["rasa", "test", "nlu", "--param", "xy", "--model", "test"],
-    ],
-)
-def test_parse_no_positional_model_path_argument(argv):
-    with make_actions_subdir():
-        sys.argv = argv.copy()
-
-        parse_last_positional_argument_as_model_path()
-
-        assert sys.argv == argv
-
-
-def test_validate_invalid_path():
-    with pytest.raises(SystemExit):
-        get_validated_path("test test test", "out", "default")
-
-
-def test_validate_valid_path():
-    tempdir = tempfile.mkdtemp()
-
-    assert get_validated_path(tempdir, "out", "default") == tempdir
-
-
-def test_validate_if_none_is_valid():
-    assert get_validated_path(None, "out", "default", True) is None
-
-
-def test_validate_with_none_if_default_is_valid(caplog: LogCaptureFixture):
-    tempdir = tempfile.mkdtemp()
-
-    with caplog.at_level(logging.WARNING, rasa.cli.utils.logger.name):
-        assert get_validated_path(None, "out", tempdir) == tempdir
-
-    assert caplog.records == []
-
-
-def test_validate_with_invalid_directory_if_default_is_valid(caplog: LogCaptureFixture):
-    tempdir = tempfile.mkdtemp()
-    invalid_directory = "gcfhvjkb"
-    with pytest.warns(UserWarning) as record:
-        assert get_validated_path(invalid_directory, "out", tempdir) == tempdir
-    assert len(record) == 1
-    assert "does not seem to exist" in record[0].message.args[0]
-
-
-def test_print_error_and_exit():
-    with pytest.raises(SystemExit):
-        rasa.cli.utils.print_error_and_exit("")

From b6b7b1074ecb6cbe75b531bee26127f9e8198666 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 22:13:56 +0100
Subject: [PATCH 353/633] return cli tests

---
 tests/cli/__init__.py              |   0
 tests/cli/conftest.py              |  34 +++
 tests/cli/test_cli.py              |  37 +++
 tests/cli/test_rasa_data.py        |  78 +++++
 tests/cli/test_rasa_init.py        |  53 ++++
 tests/cli/test_rasa_interactive.py | 158 ++++++++++
 tests/cli/test_rasa_run.py         |  46 +++
 tests/cli/test_rasa_shell.py       |  34 +++
 tests/cli/test_rasa_test.py        | 194 ++++++++++++
 tests/cli/test_rasa_train.py       | 464 +++++++++++++++++++++++++++++
 tests/cli/test_rasa_visualize.py   |  15 +
 tests/cli/test_rasa_x.py           | 152 ++++++++++
 tests/cli/test_utils.py            | 105 +++++++
 13 files changed, 1370 insertions(+)
 create mode 100644 tests/cli/__init__.py
 create mode 100644 tests/cli/conftest.py
 create mode 100644 tests/cli/test_cli.py
 create mode 100644 tests/cli/test_rasa_data.py
 create mode 100644 tests/cli/test_rasa_init.py
 create mode 100644 tests/cli/test_rasa_interactive.py
 create mode 100644 tests/cli/test_rasa_run.py
 create mode 100644 tests/cli/test_rasa_shell.py
 create mode 100644 tests/cli/test_rasa_test.py
 create mode 100644 tests/cli/test_rasa_train.py
 create mode 100644 tests/cli/test_rasa_visualize.py
 create mode 100644 tests/cli/test_rasa_x.py
 create mode 100644 tests/cli/test_utils.py

diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
new file mode 100644
index 000000000000..b7294eaeadff
--- /dev/null
+++ b/tests/cli/conftest.py
@@ -0,0 +1,34 @@
+from typing import Callable
+import pytest
+import os
+from _pytest.pytester import Testdir, RunResult
+
+
+@pytest.fixture
+def run(testdir: Testdir) -> Callable[..., RunResult]:
+    def do_run(*args):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args)
+
+    return do_run
+
+
+@pytest.fixture
+def run_with_stdin(testdir: Testdir) -> Callable[..., RunResult]:
+    def do_run(*args, stdin):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args, stdin=stdin)
+
+    return do_run
+
+
+@pytest.fixture
+def run_in_default_project(testdir: Testdir) -> Callable[..., RunResult]:
+    os.environ["LOG_LEVEL"] = "ERROR"
+    testdir.run("rasa", "init", "--no-prompt")
+
+    def do_run(*args):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args)
+
+    return do_run
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
new file mode 100644
index 000000000000..dc221349b647
--- /dev/null
+++ b/tests/cli/test_cli.py
@@ -0,0 +1,37 @@
+import pytest
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_cli_start(run: Callable[..., RunResult]):
+    """
+    Measures an average startup time and checks that it
+    does not deviate more than x seconds from 5.
+    """
+    import time
+
+    durations = []
+
+    for i in range(5):
+        start = time.time()
+        run("--help")
+        end = time.time()
+
+        durations.append(end - start)
+
+    avg_duration = sum(durations) / len(durations)
+
+    # When run in parallel, it takes a little longer
+    assert avg_duration - 5 <= 2
+
+
+def test_data_convert_help(run: Callable[..., RunResult]):
+    output = run("--help")
+
+    help_text = """usage: rasa [-h] [--version]
+            {init,run,shell,train,interactive,test,visualize,data,x} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py
new file mode 100644
index 000000000000..3021e9ab12e7
--- /dev/null
+++ b/tests/cli/test_rasa_data.py
@@ -0,0 +1,78 @@
+import os
+import pytest
+from collections import namedtuple
+from typing import Callable
+from _pytest.pytester import RunResult
+from rasa.cli import data
+
+
+def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75"
+    )
+
+    assert os.path.exists("train_test_split")
+    assert os.path.exists(os.path.join("train_test_split", "test_data.md"))
+    assert os.path.exists(os.path.join("train_test_split", "training_data.md"))
+
+
+def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "data",
+        "convert",
+        "nlu",
+        "--data",
+        "data/nlu.md",
+        "--out",
+        "out_nlu_data.json",
+        "-f",
+        "json",
+    )
+
+    assert os.path.exists("out_nlu_data.json")
+
+
+def test_data_split_help(run: Callable[..., RunResult]):
+    output = run("data", "split", "nlu", "--help")
+
+    help_text = """usage: rasa data split nlu [-h] [-v] [-vv] [--quiet] [-u NLU]
+                           [--training-fraction TRAINING_FRACTION]
+                           [--random-seed RANDOM_SEED] [--out OUT]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_data_convert_help(run: Callable[..., RunResult]):
+    output = run("data", "convert", "nlu", "--help")
+
+    help_text = """usage: rasa data convert nlu [-h] [-v] [-vv] [--quiet] --data DATA --out OUT
+                             [-l LANGUAGE] -f {json,md}"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_data_validate_help(run: Callable[..., RunResult]):
+    output = run("data", "validate", "--help")
+
+    help_text = """usage: rasa data validate [-h] [-v] [-vv] [--quiet] [--fail-on-warnings]
+                          [-d DOMAIN] [--data DATA]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_validate_files_exit_early():
+    with pytest.raises(SystemExit) as pytest_e:
+        args = {"domain": "data/test_domains/duplicate_intents.yml", "data": None}
+        data.validate_files(namedtuple("Args", args.keys())(*args.values()))
+
+    assert pytest_e.type == SystemExit
+    assert pytest_e.value.code == 1
diff --git a/tests/cli/test_rasa_init.py b/tests/cli/test_rasa_init.py
new file mode 100644
index 000000000000..c23ba563dda5
--- /dev/null
+++ b/tests/cli/test_rasa_init.py
@@ -0,0 +1,53 @@
+import os
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_init(run: Callable[..., RunResult]):
+    run("init", "--no-prompt", "--quiet")
+
+    assert os.path.exists("actions.py")
+    assert os.path.exists("domain.yml")
+    assert os.path.exists("config.yml")
+    assert os.path.exists("credentials.yml")
+    assert os.path.exists("endpoints.yml")
+    assert os.path.exists("models")
+    assert os.path.exists("data/nlu.md")
+    assert os.path.exists("data/stories.md")
+
+
+def test_init_using_init_dir_option(run: Callable[..., RunResult]):
+    os.makedirs("./workspace")
+    run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
+
+    assert os.path.exists("./workspace/actions.py")
+    assert os.path.exists("./workspace/domain.yml")
+    assert os.path.exists("./workspace/config.yml")
+    assert os.path.exists("./workspace/credentials.yml")
+    assert os.path.exists("./workspace/endpoints.yml")
+    assert os.path.exists("./workspace/models")
+    assert os.path.exists("./workspace/data/nlu.md")
+    assert os.path.exists("./workspace/data/stories.md")
+
+
+def test_not_fount_init_path(run: Callable[..., RunResult]):
+    output = run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
+
+    assert (
+        output.outlines[-1]
+        == "\033[91mProject init path './workspace' not found.\033[0m"
+    )
+
+
+def test_init_help(run: Callable[..., RunResult]):
+    output = run("init", "--help")
+
+    assert (
+        output.outlines[0]
+        == "usage: rasa init [-h] [-v] [-vv] [--quiet] [--no-prompt] [--init-dir INIT_DIR]"
+    )
+
+
+def test_user_asked_to_train_model(run_with_stdin: Callable[..., RunResult]):
+    run_with_stdin("init", stdin=b"\nYN")
+    assert not os.path.exists("models")
diff --git a/tests/cli/test_rasa_interactive.py b/tests/cli/test_rasa_interactive.py
new file mode 100644
index 000000000000..eabc315d1089
--- /dev/null
+++ b/tests/cli/test_rasa_interactive.py
@@ -0,0 +1,158 @@
+import argparse
+import pytest
+from typing import Callable, Text
+from unittest.mock import Mock
+
+from _pytest.monkeypatch import MonkeyPatch
+from _pytest.pytester import RunResult
+
+import rasa
+from rasa.cli import interactive, train
+
+
+def test_interactive_help(run: Callable[..., RunResult]):
+    output = run("interactive", "--help")
+
+    help_text = """usage: rasa interactive [-h] [-v] [-vv] [--quiet] [--e2e] [-m MODEL]
+                        [--data DATA [DATA ...]] [--skip-visualization]
+                        [--endpoints ENDPOINTS] [-c CONFIG] [-d DOMAIN]
+                        [--out OUT] [--augmentation AUGMENTATION]
+                        [--debug-plots] [--dump-stories] [--force]
+                        [--persist-nlu-data]
+                        {core} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_interactive_core_help(run: Callable[..., RunResult]):
+    output = run("interactive", "core", "--help")
+
+    help_text = """usage: rasa interactive core [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
+                             [--skip-visualization] [--endpoints ENDPOINTS]
+                             [-c CONFIG] [-d DOMAIN] [--out OUT]
+                             [--augmentation AUGMENTATION] [--debug-plots]
+                             [--dump-stories]
+                             [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_pass_arguments_to_rasa_train(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    # Create parser
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    # Parse interactive command
+    args = parser.parse_args(["interactive", "--config", default_stack_config])
+    interactive._set_not_required_args(args)
+
+    # Mock actual training
+    mock = Mock()
+    monkeypatch.setattr(rasa, "train", mock.method)
+
+    # If the `Namespace` object does not have all required fields this will throw
+    train.train(args)
+
+    # Assert `train` was actually called
+    mock.method.assert_called_once()
+
+
+def test_train_called_when_no_model_passed(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "--config",
+            default_stack_config,
+            "--data",
+            "examples/moodbot/data",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    # Mock actual training and interactive learning methods
+    mock = Mock()
+    monkeypatch.setattr(train, "train", mock.train_model)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    interactive.interactive(args)
+    mock.train_model.assert_called_once()
+
+
+def test_train_core_called_when_no_model_passed_and_core(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "core",
+            "--config",
+            default_stack_config,
+            "--stories",
+            "examples/moodbot/data/stories.md",
+            "--domain",
+            "examples/moodbot/domain.yml",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    # Mock actual training and interactive learning methods
+    mock = Mock()
+    monkeypatch.setattr(train, "train_core", mock.train_core)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    interactive.interactive(args)
+    mock.train_core.assert_called_once()
+
+
+def test_no_interactive_without_core_data(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "--config",
+            default_stack_config,
+            "--data",
+            "examples/moodbot/data/nlu.md",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    mock = Mock()
+    monkeypatch.setattr(train, "train", mock.train_model)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    with pytest.raises(SystemExit):
+        interactive.interactive(args)
+
+    mock.train_model.assert_not_called()
+    mock.perform_interactive_learning.assert_not_called()
diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py
new file mode 100644
index 000000000000..8de8685e91dd
--- /dev/null
+++ b/tests/cli/test_rasa_run.py
@@ -0,0 +1,46 @@
+import os
+import shutil
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]):
+    os.remove("domain.yml")
+    shutil.rmtree("models")
+
+    # the server should not start as no model is configured
+    output = run_in_default_project("run")
+
+    assert "No model found." in output.outlines[0]
+
+
+def test_run_help(run: Callable[..., RunResult]):
+    output = run("run", "--help")
+
+    help_text = """usage: rasa run [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
+                [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+                [--cors [CORS [CORS ...]]] [--enable-api]
+                [--remote-storage REMOTE_STORAGE]
+                [--ssl-certificate SSL_CERTIFICATE]
+                [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
+                [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
+                [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
+                [--jwt-method JWT_METHOD]
+                {actions} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_run_action_help(run: Callable[..., RunResult]):
+    output = run("run", "actions", "--help")
+
+    help_text = """usage: rasa run actions [-h] [-v] [-vv] [--quiet] [-p PORT]
+                        [--cors [CORS [CORS ...]]] [--actions ACTIONS]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_shell.py b/tests/cli/test_rasa_shell.py
new file mode 100644
index 000000000000..7301db203ec0
--- /dev/null
+++ b/tests/cli/test_rasa_shell.py
@@ -0,0 +1,34 @@
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_shell_help(run: Callable[..., RunResult]):
+    output = run("shell", "--help")
+
+    help_text = """usage: rasa shell [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
+                  [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+                  [--cors [CORS [CORS ...]]] [--enable-api]
+                  [--remote-storage REMOTE_STORAGE]
+                  [--ssl-certificate SSL_CERTIFICATE]
+                  [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
+                  [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
+                  [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
+                  [--jwt-method JWT_METHOD]
+                  {nlu} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_shell_nlu_help(run: Callable[..., RunResult]):
+    output = run("shell", "nlu", "--help")
+
+    help_text = """usage: rasa shell nlu [-h] [-v] [-vv] [--quiet] [-m MODEL]
+                      [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
new file mode 100644
index 000000000000..50d21c6e6978
--- /dev/null
+++ b/tests/cli/test_rasa_test.py
@@ -0,0 +1,194 @@
+import os
+from shutil import copyfile
+from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE
+from rasa.utils.io import list_files, write_yaml_file
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_test_core(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "core", "--stories", "data")
+
+    assert os.path.exists("results")
+
+
+def test_test_core_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "core", "--no-plot")
+
+    assert not os.path.exists("results/story_confmat.pdf")
+
+
+def test_test(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test")
+
+    assert os.path.exists("results")
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+
+
+def test_test_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "--no-plot")
+
+    assert not os.path.exists("results/hist.png")
+    assert not os.path.exists("results/confmat.png")
+    assert not os.path.exists("results/story_confmat.pdf")
+
+
+def test_test_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "nlu", "--nlu", "data", "--successes")
+
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+    assert os.path.exists("results/intent_successes.json")
+
+
+def test_test_nlu_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "nlu", "--no-plot")
+
+    assert not os.path.exists("results/confmat.png")
+    assert not os.path.exists("results/hist.png")
+
+
+def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "test", "nlu", "--cross-validation", "-c", "config.yml", "-f", "2"
+    )
+
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+
+
+def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
+    copyfile("config.yml", "nlu-config.yml")
+
+    run_in_default_project(
+        "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2"
+    )
+
+    assert os.path.exists("results/run_1")
+    assert os.path.exists("results/run_2")
+
+
+def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
+    files = list_files("models")
+    copyfile(files[0], "models/copy-model.tar.gz")
+
+    run_in_default_project(
+        "test",
+        "core",
+        "-m",
+        files[0],
+        "models/copy-model.tar.gz",
+        "--stories",
+        "data/stories.md",
+    )
+
+    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
+
+
+def test_test_core_comparison_after_train(
+    run_in_default_project: Callable[..., RunResult]
+):
+    write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "KerasPolicy"}],
+        },
+        "config_1.yml",
+    )
+
+    write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "MemoizationPolicy"}],
+        },
+        "config_2.yml",
+    )
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config_1.yml",
+        "config_2.yml",
+        "--stories",
+        "data/stories.md",
+        "--runs",
+        "2",
+        "--percentages",
+        "25",
+        "75",
+        "--augmentation",
+        "5",
+        "--out",
+        "comparison_models",
+    )
+
+    assert os.path.exists("comparison_models")
+    assert os.path.exists("comparison_models/run_1")
+    assert os.path.exists("comparison_models/run_2")
+
+    run_in_default_project(
+        "test",
+        "core",
+        "-m",
+        "comparison_models",
+        "--stories",
+        "data/stories",
+        "--evaluate-model-directory",
+    )
+
+    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
+    assert os.path.exists(
+        os.path.join(DEFAULT_RESULTS_PATH, "core_model_comparison_graph.pdf")
+    )
+
+
+def test_test_help(run: Callable[..., RunResult]):
+    output = run("test", "--help")
+
+    help_text = """usage: rasa test [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
+                 [--max-stories MAX_STORIES] [--e2e] [--endpoints ENDPOINTS]
+                 [--fail-on-prediction-errors] [--url URL]
+                 [--evaluate-model-directory] [-u NLU] [--out OUT]
+                 [--successes] [--no-errors] [--histogram HISTOGRAM]
+                 [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
+                 [--cross-validation] [-f FOLDS] [-r RUNS]
+                 [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]
+                 {core,nlu} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_test_nlu_help(run: Callable[..., RunResult]):
+    output = run("test", "nlu", "--help")
+
+    help_text = """usage: rasa test nlu [-h] [-v] [-vv] [--quiet] [-m MODEL] [-u NLU] [--out OUT]
+                     [--successes] [--no-errors] [--histogram HISTOGRAM]
+                     [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
+                     [--cross-validation] [-f FOLDS] [-r RUNS]
+                     [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_test_core_help(run: Callable[..., RunResult]):
+    output = run("test", "core", "--help")
+
+    help_text = """usage: rasa test core [-h] [-v] [-vv] [--quiet] [-m MODEL [MODEL ...]]
+                      [-s STORIES] [--max-stories MAX_STORIES] [--out OUT]
+                      [--e2e] [--endpoints ENDPOINTS]
+                      [--fail-on-prediction-errors] [--url URL]
+                      [--evaluate-model-directory] [--no-plot]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py
new file mode 100644
index 000000000000..1839c77b3ab9
--- /dev/null
+++ b/tests/cli/test_rasa_train.py
@@ -0,0 +1,464 @@
+import os
+import shutil
+import tempfile
+
+import pytest
+from typing import Callable
+from _pytest.pytester import RunResult
+
+from rasa import model
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import training_data
+from rasa.cli.train import _get_valid_config
+from rasa.constants import (
+    CONFIG_MANDATORY_KEYS_CORE,
+    CONFIG_MANDATORY_KEYS,
+    CONFIG_MANDATORY_KEYS_NLU,
+)
+import rasa.utils.io as io_utils
+
+
+def test_train(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models",
+        "--fixed-model-name",
+        "test-model",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "train_models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is None
+    assert not os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models",
+        "--fixed-model-name",
+        "test-model",
+        "--persist-nlu-data",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "train_models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is not None
+    assert os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    io_utils.write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "KerasPolicy"}],
+        },
+        "config_1.yml",
+    )
+
+    io_utils.write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "MemoizationPolicy"}],
+        },
+        "config_2.yml",
+    )
+
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config_1.yml",
+        "config_2.yml",
+        "--stories",
+        "data/stories.md",
+        "--out",
+        "core_comparison_results",
+        "--runs",
+        "2",
+        "--percentages",
+        "25",
+        "75",
+        "--augmentation",
+        "5",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "core_comparison_results"))
+    run_directories = io_utils.list_subdirectories(
+        os.path.join(temp_dir, "core_comparison_results")
+    )
+    assert len(run_directories) == 2
+    model_files = io_utils.list_files(
+        os.path.join(temp_dir, "core_comparison_results", run_directories[0])
+    )
+    assert len(model_files) == 4
+    assert model_files[0].endswith("tar.gz")
+
+
+def test_train_no_domain_exists(
+    run_in_default_project: Callable[..., RunResult]
+) -> None:
+
+    os.remove("domain.yml")
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models_no_domain",
+        "--fixed-model-name",
+        "nlu-model-only",
+    )
+
+    assert os.path.exists("train_models_no_domain")
+    files = io_utils.list_files("train_models_no_domain")
+    assert len(files) == 1
+
+    trained_model_path = "train_models_no_domain/nlu-model-only.tar.gz"
+    unpacked = model.unpack_model(trained_model_path)
+
+    metadata_path = os.path.join(unpacked, "nlu", "metadata.json")
+    assert os.path.exists(metadata_path)
+
+
+def test_train_skip_on_model_not_changed(
+    run_in_default_project: Callable[..., RunResult]
+):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+
+    file_name = files[0]
+    run_in_default_project("train")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert file_name == files[0]
+
+
+def test_train_force(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+
+    run_in_default_project("train", "--force")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 2
+
+
+def test_train_with_only_nlu_data(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "data/stories.md"))
+    os.remove(os.path.join(temp_dir, "data/stories.md"))
+    shutil.rmtree(os.path.join(temp_dir, "models"))
+
+    run_in_default_project("train", "--fixed-model-name", "test-model")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+
+
+def test_train_with_only_core_data(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "data/nlu.md"))
+    os.remove(os.path.join(temp_dir, "data/nlu.md"))
+    shutil.rmtree(os.path.join(temp_dir, "models"))
+
+    run_in_default_project("train", "--fixed-model-name", "test-model")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+
+
+def test_train_core(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--stories",
+        "data",
+        "--out",
+        "train_rasa_models",
+        "--fixed-model-name",
+        "rasa-model",
+    )
+
+    assert os.path.exists("train_rasa_models/rasa-model.tar.gz")
+    assert os.path.isfile("train_rasa_models/rasa-model.tar.gz")
+
+
+def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]):
+
+    os.remove("domain.yml")
+    run_in_default_project(
+        "train",
+        "core",
+        "--config",
+        "config.yml",
+        "--domain",
+        "domain1.yml",
+        "--stories",
+        "data",
+        "--out",
+        "train_rasa_models_no_domain",
+        "--fixed-model-name",
+        "rasa-model",
+    )
+
+    assert not os.path.exists("train_rasa_models_no_domain/rasa-model.tar.gz")
+    assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz")
+
+
+def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "train",
+        "nlu",
+        "-c",
+        "config.yml",
+        "--nlu",
+        "data/nlu.md",
+        "--out",
+        "train_models",
+    )
+
+    assert os.path.exists("train_models")
+    files = io_utils.list_files("train_models")
+    assert len(files) == 1
+    assert os.path.basename(files[0]).startswith("nlu-")
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is None
+    assert not os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_nlu_persist_nlu_data(
+    run_in_default_project: Callable[..., RunResult]
+) -> None:
+    run_in_default_project(
+        "train",
+        "nlu",
+        "-c",
+        "config.yml",
+        "--nlu",
+        "data/nlu.md",
+        "--out",
+        "train_models",
+        "--persist-nlu-data",
+    )
+
+    assert os.path.exists("train_models")
+    files = io_utils.list_files("train_models")
+    assert len(files) == 1
+    assert os.path.basename(files[0]).startswith("nlu-")
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is not None
+    assert os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_help(run):
+    output = run("train", "--help")
+
+    help_text = """usage: rasa train [-h] [-v] [-vv] [--quiet] [--data DATA [DATA ...]]
+                  [-c CONFIG] [-d DOMAIN] [--out OUT]
+                  [--augmentation AUGMENTATION] [--debug-plots]
+                  [--dump-stories] [--fixed-model-name FIXED_MODEL_NAME]
+                  [--persist-nlu-data] [--force]
+                  {core,nlu} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_train_nlu_help(run: Callable[..., RunResult]):
+    output = run("train", "nlu", "--help")
+
+    help_text = """usage: rasa train nlu [-h] [-v] [-vv] [--quiet] [-c CONFIG] [--out OUT]
+                      [-u NLU] [--fixed-model-name FIXED_MODEL_NAME]
+                      [--persist-nlu-data]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_train_core_help(run: Callable[..., RunResult]):
+    output = run("train", "core", "--help")
+
+    help_text = """usage: rasa train core [-h] [-v] [-vv] [--quiet] [-s STORIES] [-d DOMAIN]
+                       [-c CONFIG [CONFIG ...]] [--out OUT]
+                       [--augmentation AUGMENTATION] [--debug-plots]
+                       [--dump-stories] [--force]
+                       [--fixed-model-name FIXED_MODEL_NAME]
+                       [--percentages [PERCENTAGES [PERCENTAGES ...]]]
+                       [--runs RUNS]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+@pytest.mark.parametrize(
+    "parameters",
+    [
+        {
+            "config_data": {"language": "en", "pipeline": "supervised"},
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
+            "error": True,
+        },
+        {
+            "config_data": {},
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": True,
+        },
+        {
+            "config_data": {
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+                "imports": "other-folder",
+            },
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": {
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": False,
+        },
+        {
+            "config_data": None,
+            "default_config": {"language": "en", "pipeline": "supervised"},
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": None,
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": True,
+        },
+    ],
+)
+def test_get_valid_config(parameters):
+    import rasa.utils.io
+
+    config_path = None
+    if parameters["config_data"] is not None:
+        config_path = os.path.join(tempfile.mkdtemp(), "config.yml")
+        rasa.utils.io.write_yaml_file(parameters["config_data"], config_path)
+
+    default_config_path = None
+    if parameters["default_config"] is not None:
+        default_config_path = os.path.join(tempfile.mkdtemp(), "default-config.yml")
+        rasa.utils.io.write_yaml_file(parameters["default_config"], default_config_path)
+
+    if parameters["error"]:
+        with pytest.raises(SystemExit):
+            _get_valid_config(config_path, parameters["mandatory_keys"])
+
+    else:
+        config_path = _get_valid_config(
+            config_path, parameters["mandatory_keys"], default_config_path
+        )
+
+        config_data = rasa.utils.io.read_yaml_file(config_path)
+
+        for k in parameters["mandatory_keys"]:
+            assert k in config_data
+
+
+def test_get_valid_config_with_non_existing_file():
+    with pytest.raises(SystemExit):
+        _get_valid_config("non-existing-file.yml", CONFIG_MANDATORY_KEYS)
diff --git a/tests/cli/test_rasa_visualize.py b/tests/cli/test_rasa_visualize.py
new file mode 100644
index 000000000000..f69115c9529b
--- /dev/null
+++ b/tests/cli/test_rasa_visualize.py
@@ -0,0 +1,15 @@
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_visualize_help(run: Callable[..., RunResult]):
+    output = run("visualize", "--help")
+
+    help_text = """usage: rasa visualize [-h] [-v] [-vv] [--quiet] [-d DOMAIN] [-s STORIES]
+                      [-c CONFIG] [--out OUT] [--max-history MAX_HISTORY]
+                      [-u NLU]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_x.py b/tests/cli/test_rasa_x.py
new file mode 100644
index 000000000000..656ec082b8a4
--- /dev/null
+++ b/tests/cli/test_rasa_x.py
@@ -0,0 +1,152 @@
+from pathlib import Path
+
+import pytest
+from typing import Callable, Dict
+from _pytest.pytester import RunResult
+
+
+from aioresponses import aioresponses
+
+import rasa.utils.io as io_utils
+from rasa.cli import x
+from rasa.utils.endpoints import EndpointConfig
+from rasa.core.utils import AvailableEndpoints
+
+
+def test_x_help(run: Callable[..., RunResult]):
+    output = run("x", "--help")
+
+    help_text = """usage: rasa x [-h] [-v] [-vv] [--quiet] [-m MODEL] [--data DATA] [-c CONFIG]
+              [--no-prompt] [--production] [--rasa-x-port RASA_X_PORT]
+              [--config-endpoint CONFIG_ENDPOINT] [--log-file LOG_FILE]
+              [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+              [--cors [CORS [CORS ...]]] [--enable-api]
+              [--remote-storage REMOTE_STORAGE]
+              [--ssl-certificate SSL_CERTIFICATE] [--ssl-keyfile SSL_KEYFILE]
+              [--ssl-ca-file SSL_CA_FILE] [--ssl-password SSL_PASSWORD]
+              [--credentials CREDENTIALS] [--connector CONNECTOR]
+              [--jwt-secret JWT_SECRET] [--jwt-method JWT_METHOD]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_prepare_credentials_for_rasa_x_if_rasa_channel_not_given(tmpdir: Path):
+    credentials_path = str(tmpdir / "credentials.yml")
+
+    io_utils.write_yaml_file({}, credentials_path)
+
+    tmp_credentials = x._prepare_credentials_for_rasa_x(
+        credentials_path, "http://localhost:5002"
+    )
+
+    actual = io_utils.read_config_file(tmp_credentials)
+
+    assert actual["rasa"]["url"] == "http://localhost:5002"
+
+
+def test_prepare_credentials_if_already_valid(tmpdir: Path):
+    credentials_path = str(tmpdir / "credentials.yml")
+
+    credentials = {
+        "rasa": {"url": "my-custom-url"},
+        "another-channel": {"url": "some-url"},
+    }
+    io_utils.write_yaml_file(credentials, credentials_path)
+
+    x._prepare_credentials_for_rasa_x(credentials_path)
+
+    actual = io_utils.read_config_file(credentials_path)
+
+    assert actual == credentials
+
+
+def test_if_default_endpoint_config_is_valid_in_local_mode():
+    event_broker_endpoint = x._get_event_broker_endpoint(None)
+
+    assert x._is_correct_event_broker(event_broker_endpoint)
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"type": "mongo", "url": "mongodb://localhost:27017"},
+        {"type": "sql", "dialect": "postgresql"},
+        {"type": "sql", "dialect": "sqlite", "db": "some.db"},
+    ],
+)
+def test_if_endpoint_config_is_invalid_in_local_mode(kwargs: Dict):
+    config = EndpointConfig(**kwargs)
+    assert not x._is_correct_event_broker(config)
+
+
+def test_overwrite_model_server_url():
+    endpoint_config = EndpointConfig(url="http://testserver:5002/models/default@latest")
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+def test_overwrite_model_server_url_with_no_model_endpoint():
+    endpoints = AvailableEndpoints()
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+def test_reuse_wait_time_between_pulls():
+    test_wait_time = 5
+    endpoint_config = EndpointConfig(
+        url="http://localhost:5002/models/default@latest",
+        wait_time_between_pulls=test_wait_time,
+    )
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    assert endpoints.model.kwargs["wait_time_between_pulls"] == test_wait_time
+
+
+def test_default_wait_time_between_pulls():
+    endpoint_config = EndpointConfig(url="http://localhost:5002/models/default@latest")
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert endpoints.model.kwargs["wait_time_between_pulls"] == 2
+
+
+def test_default_model_server_url():
+    endpoint_config = EndpointConfig()
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+async def test_pull_runtime_config_from_server():
+    config_url = "http://example.com/api/config?token=token"
+    credentials = "rasa: http://example.com:5002/api"
+    endpoint_config = """
+    event_broker:
+        url: http://example.com/event_broker
+        username: some_username
+        password: PASSWORD
+        queue: broker_queue
+    """
+    with aioresponses() as mocked:
+        mocked.get(
+            config_url,
+            payload={"credentials": credentials, "endpoints": endpoint_config},
+        )
+
+        endpoints_path, credentials_path = await x._pull_runtime_config_from_server(
+            config_url, 1, 0
+        )
+
+        assert io_utils.read_file(endpoints_path) == endpoint_config
+        assert io_utils.read_file(credentials_path) == credentials
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
new file mode 100644
index 000000000000..4d270be3964b
--- /dev/null
+++ b/tests/cli/test_utils.py
@@ -0,0 +1,105 @@
+import contextlib
+import logging
+import os
+import pathlib
+import sys
+import tempfile
+
+import pytest
+from _pytest.logging import LogCaptureFixture
+
+import rasa.cli.utils
+from rasa.cli.utils import (
+    parse_last_positional_argument_as_model_path,
+    get_validated_path,
+)
+
+
+@contextlib.contextmanager
+def make_actions_subdir():
+    """Create a subdir called actions to test model argument handling."""
+    with tempfile.TemporaryDirectory() as tempdir:
+        cwd = os.getcwd()
+        os.chdir(tempdir)
+        try:
+            (pathlib.Path(tempdir) / "actions").mkdir()
+            yield
+        finally:
+            os.chdir(cwd)
+
+
+@pytest.mark.parametrize(
+    "argv",
+    [
+        ["rasa", "run"],
+        ["rasa", "run", "actions"],
+        ["rasa", "run", "core"],
+        ["rasa", "interactive", "nlu", "--param", "xy"],
+    ],
+)
+def test_parse_last_positional_argument_as_model_path(argv):
+    with make_actions_subdir():
+        test_model_dir = tempfile.gettempdir()
+        argv.append(test_model_dir)
+
+        sys.argv = argv.copy()
+        parse_last_positional_argument_as_model_path()
+
+        assert sys.argv[-2] == "--model"
+        assert sys.argv[-1] == test_model_dir
+
+
+@pytest.mark.parametrize(
+    "argv",
+    [
+        ["rasa", "run"],
+        ["rasa", "run", "actions"],
+        ["rasa", "run", "core"],
+        ["rasa", "test", "nlu", "--param", "xy", "--model", "test"],
+    ],
+)
+def test_parse_no_positional_model_path_argument(argv):
+    with make_actions_subdir():
+        sys.argv = argv.copy()
+
+        parse_last_positional_argument_as_model_path()
+
+        assert sys.argv == argv
+
+
+def test_validate_invalid_path():
+    with pytest.raises(SystemExit):
+        get_validated_path("test test test", "out", "default")
+
+
+def test_validate_valid_path():
+    tempdir = tempfile.mkdtemp()
+
+    assert get_validated_path(tempdir, "out", "default") == tempdir
+
+
+def test_validate_if_none_is_valid():
+    assert get_validated_path(None, "out", "default", True) is None
+
+
+def test_validate_with_none_if_default_is_valid(caplog: LogCaptureFixture):
+    tempdir = tempfile.mkdtemp()
+
+    with caplog.at_level(logging.WARNING, rasa.cli.utils.logger.name):
+        assert get_validated_path(None, "out", tempdir) == tempdir
+
+    assert caplog.records == []
+
+
+def test_validate_with_invalid_directory_if_default_is_valid(caplog: LogCaptureFixture):
+    tempdir = tempfile.mkdtemp()
+    invalid_directory = "gcfhvjkb"
+    with pytest.warns(UserWarning) as record:
+        assert get_validated_path(invalid_directory, "out", tempdir) == tempdir
+    assert len(record) == 1
+    assert "does not seem to exist" in record[0].message.args[0]
+
+
+def test_print_error_and_exit():
+    with pytest.raises(SystemExit):
+        rasa.cli.utils.print_error_and_exit("")

From 50d87a32465f2d12723610fa0b385490af8f0e3a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 22:14:32 +0100
Subject: [PATCH 354/633] TEMPORARY delete cli tests

---
 tests/cli/__init__.py              |   0
 tests/cli/conftest.py              |  34 ---
 tests/cli/test_cli.py              |  37 ---
 tests/cli/test_rasa_data.py        |  78 -----
 tests/cli/test_rasa_init.py        |  53 ----
 tests/cli/test_rasa_interactive.py | 158 ----------
 tests/cli/test_rasa_run.py         |  46 ---
 tests/cli/test_rasa_shell.py       |  34 ---
 tests/cli/test_rasa_test.py        | 194 ------------
 tests/cli/test_rasa_train.py       | 464 -----------------------------
 tests/cli/test_rasa_visualize.py   |  15 -
 tests/cli/test_rasa_x.py           | 152 ----------
 tests/cli/test_utils.py            | 105 -------
 13 files changed, 1370 deletions(-)
 delete mode 100644 tests/cli/__init__.py
 delete mode 100644 tests/cli/conftest.py
 delete mode 100644 tests/cli/test_cli.py
 delete mode 100644 tests/cli/test_rasa_data.py
 delete mode 100644 tests/cli/test_rasa_init.py
 delete mode 100644 tests/cli/test_rasa_interactive.py
 delete mode 100644 tests/cli/test_rasa_run.py
 delete mode 100644 tests/cli/test_rasa_shell.py
 delete mode 100644 tests/cli/test_rasa_test.py
 delete mode 100644 tests/cli/test_rasa_train.py
 delete mode 100644 tests/cli/test_rasa_visualize.py
 delete mode 100644 tests/cli/test_rasa_x.py
 delete mode 100644 tests/cli/test_utils.py

diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
deleted file mode 100644
index b7294eaeadff..000000000000
--- a/tests/cli/conftest.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Callable
-import pytest
-import os
-from _pytest.pytester import Testdir, RunResult
-
-
-@pytest.fixture
-def run(testdir: Testdir) -> Callable[..., RunResult]:
-    def do_run(*args):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args)
-
-    return do_run
-
-
-@pytest.fixture
-def run_with_stdin(testdir: Testdir) -> Callable[..., RunResult]:
-    def do_run(*args, stdin):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args, stdin=stdin)
-
-    return do_run
-
-
-@pytest.fixture
-def run_in_default_project(testdir: Testdir) -> Callable[..., RunResult]:
-    os.environ["LOG_LEVEL"] = "ERROR"
-    testdir.run("rasa", "init", "--no-prompt")
-
-    def do_run(*args):
-        args = ["rasa"] + list(args)
-        return testdir.run(*args)
-
-    return do_run
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
deleted file mode 100644
index dc221349b647..000000000000
--- a/tests/cli/test_cli.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pytest
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_cli_start(run: Callable[..., RunResult]):
-    """
-    Measures an average startup time and checks that it
-    does not deviate more than x seconds from 5.
-    """
-    import time
-
-    durations = []
-
-    for i in range(5):
-        start = time.time()
-        run("--help")
-        end = time.time()
-
-        durations.append(end - start)
-
-    avg_duration = sum(durations) / len(durations)
-
-    # When run in parallel, it takes a little longer
-    assert avg_duration - 5 <= 2
-
-
-def test_data_convert_help(run: Callable[..., RunResult]):
-    output = run("--help")
-
-    help_text = """usage: rasa [-h] [--version]
-            {init,run,shell,train,interactive,test,visualize,data,x} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py
deleted file mode 100644
index 3021e9ab12e7..000000000000
--- a/tests/cli/test_rasa_data.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import os
-import pytest
-from collections import namedtuple
-from typing import Callable
-from _pytest.pytester import RunResult
-from rasa.cli import data
-
-
-def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75"
-    )
-
-    assert os.path.exists("train_test_split")
-    assert os.path.exists(os.path.join("train_test_split", "test_data.md"))
-    assert os.path.exists(os.path.join("train_test_split", "training_data.md"))
-
-
-def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "data",
-        "convert",
-        "nlu",
-        "--data",
-        "data/nlu.md",
-        "--out",
-        "out_nlu_data.json",
-        "-f",
-        "json",
-    )
-
-    assert os.path.exists("out_nlu_data.json")
-
-
-def test_data_split_help(run: Callable[..., RunResult]):
-    output = run("data", "split", "nlu", "--help")
-
-    help_text = """usage: rasa data split nlu [-h] [-v] [-vv] [--quiet] [-u NLU]
-                           [--training-fraction TRAINING_FRACTION]
-                           [--random-seed RANDOM_SEED] [--out OUT]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_data_convert_help(run: Callable[..., RunResult]):
-    output = run("data", "convert", "nlu", "--help")
-
-    help_text = """usage: rasa data convert nlu [-h] [-v] [-vv] [--quiet] --data DATA --out OUT
-                             [-l LANGUAGE] -f {json,md}"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_data_validate_help(run: Callable[..., RunResult]):
-    output = run("data", "validate", "--help")
-
-    help_text = """usage: rasa data validate [-h] [-v] [-vv] [--quiet] [--fail-on-warnings]
-                          [-d DOMAIN] [--data DATA]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_validate_files_exit_early():
-    with pytest.raises(SystemExit) as pytest_e:
-        args = {"domain": "data/test_domains/duplicate_intents.yml", "data": None}
-        data.validate_files(namedtuple("Args", args.keys())(*args.values()))
-
-    assert pytest_e.type == SystemExit
-    assert pytest_e.value.code == 1
diff --git a/tests/cli/test_rasa_init.py b/tests/cli/test_rasa_init.py
deleted file mode 100644
index c23ba563dda5..000000000000
--- a/tests/cli/test_rasa_init.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_init(run: Callable[..., RunResult]):
-    run("init", "--no-prompt", "--quiet")
-
-    assert os.path.exists("actions.py")
-    assert os.path.exists("domain.yml")
-    assert os.path.exists("config.yml")
-    assert os.path.exists("credentials.yml")
-    assert os.path.exists("endpoints.yml")
-    assert os.path.exists("models")
-    assert os.path.exists("data/nlu.md")
-    assert os.path.exists("data/stories.md")
-
-
-def test_init_using_init_dir_option(run: Callable[..., RunResult]):
-    os.makedirs("./workspace")
-    run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
-
-    assert os.path.exists("./workspace/actions.py")
-    assert os.path.exists("./workspace/domain.yml")
-    assert os.path.exists("./workspace/config.yml")
-    assert os.path.exists("./workspace/credentials.yml")
-    assert os.path.exists("./workspace/endpoints.yml")
-    assert os.path.exists("./workspace/models")
-    assert os.path.exists("./workspace/data/nlu.md")
-    assert os.path.exists("./workspace/data/stories.md")
-
-
-def test_not_fount_init_path(run: Callable[..., RunResult]):
-    output = run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
-
-    assert (
-        output.outlines[-1]
-        == "\033[91mProject init path './workspace' not found.\033[0m"
-    )
-
-
-def test_init_help(run: Callable[..., RunResult]):
-    output = run("init", "--help")
-
-    assert (
-        output.outlines[0]
-        == "usage: rasa init [-h] [-v] [-vv] [--quiet] [--no-prompt] [--init-dir INIT_DIR]"
-    )
-
-
-def test_user_asked_to_train_model(run_with_stdin: Callable[..., RunResult]):
-    run_with_stdin("init", stdin=b"\nYN")
-    assert not os.path.exists("models")
diff --git a/tests/cli/test_rasa_interactive.py b/tests/cli/test_rasa_interactive.py
deleted file mode 100644
index eabc315d1089..000000000000
--- a/tests/cli/test_rasa_interactive.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import argparse
-import pytest
-from typing import Callable, Text
-from unittest.mock import Mock
-
-from _pytest.monkeypatch import MonkeyPatch
-from _pytest.pytester import RunResult
-
-import rasa
-from rasa.cli import interactive, train
-
-
-def test_interactive_help(run: Callable[..., RunResult]):
-    output = run("interactive", "--help")
-
-    help_text = """usage: rasa interactive [-h] [-v] [-vv] [--quiet] [--e2e] [-m MODEL]
-                        [--data DATA [DATA ...]] [--skip-visualization]
-                        [--endpoints ENDPOINTS] [-c CONFIG] [-d DOMAIN]
-                        [--out OUT] [--augmentation AUGMENTATION]
-                        [--debug-plots] [--dump-stories] [--force]
-                        [--persist-nlu-data]
-                        {core} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_interactive_core_help(run: Callable[..., RunResult]):
-    output = run("interactive", "core", "--help")
-
-    help_text = """usage: rasa interactive core [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
-                             [--skip-visualization] [--endpoints ENDPOINTS]
-                             [-c CONFIG] [-d DOMAIN] [--out OUT]
-                             [--augmentation AUGMENTATION] [--debug-plots]
-                             [--dump-stories]
-                             [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_pass_arguments_to_rasa_train(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    # Create parser
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    # Parse interactive command
-    args = parser.parse_args(["interactive", "--config", default_stack_config])
-    interactive._set_not_required_args(args)
-
-    # Mock actual training
-    mock = Mock()
-    monkeypatch.setattr(rasa, "train", mock.method)
-
-    # If the `Namespace` object does not have all required fields this will throw
-    train.train(args)
-
-    # Assert `train` was actually called
-    mock.method.assert_called_once()
-
-
-def test_train_called_when_no_model_passed(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "--config",
-            default_stack_config,
-            "--data",
-            "examples/moodbot/data",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    # Mock actual training and interactive learning methods
-    mock = Mock()
-    monkeypatch.setattr(train, "train", mock.train_model)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    interactive.interactive(args)
-    mock.train_model.assert_called_once()
-
-
-def test_train_core_called_when_no_model_passed_and_core(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "core",
-            "--config",
-            default_stack_config,
-            "--stories",
-            "examples/moodbot/data/stories.md",
-            "--domain",
-            "examples/moodbot/domain.yml",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    # Mock actual training and interactive learning methods
-    mock = Mock()
-    monkeypatch.setattr(train, "train_core", mock.train_core)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    interactive.interactive(args)
-    mock.train_core.assert_called_once()
-
-
-def test_no_interactive_without_core_data(
-    default_stack_config: Text, monkeypatch: MonkeyPatch
-) -> None:
-    parser = argparse.ArgumentParser()
-    sub_parser = parser.add_subparsers()
-    interactive.add_subparser(sub_parser, [])
-
-    args = parser.parse_args(
-        [
-            "interactive",
-            "--config",
-            default_stack_config,
-            "--data",
-            "examples/moodbot/data/nlu.md",
-        ]
-    )
-    interactive._set_not_required_args(args)
-
-    mock = Mock()
-    monkeypatch.setattr(train, "train", mock.train_model)
-    monkeypatch.setattr(
-        interactive, "perform_interactive_learning", mock.perform_interactive_learning
-    )
-
-    with pytest.raises(SystemExit):
-        interactive.interactive(args)
-
-    mock.train_model.assert_not_called()
-    mock.perform_interactive_learning.assert_not_called()
diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py
deleted file mode 100644
index 8de8685e91dd..000000000000
--- a/tests/cli/test_rasa_run.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-import shutil
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]):
-    os.remove("domain.yml")
-    shutil.rmtree("models")
-
-    # the server should not start as no model is configured
-    output = run_in_default_project("run")
-
-    assert "No model found." in output.outlines[0]
-
-
-def test_run_help(run: Callable[..., RunResult]):
-    output = run("run", "--help")
-
-    help_text = """usage: rasa run [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
-                [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-                [--cors [CORS [CORS ...]]] [--enable-api]
-                [--remote-storage REMOTE_STORAGE]
-                [--ssl-certificate SSL_CERTIFICATE]
-                [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
-                [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
-                [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
-                [--jwt-method JWT_METHOD]
-                {actions} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_run_action_help(run: Callable[..., RunResult]):
-    output = run("run", "actions", "--help")
-
-    help_text = """usage: rasa run actions [-h] [-v] [-vv] [--quiet] [-p PORT]
-                        [--cors [CORS [CORS ...]]] [--actions ACTIONS]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_shell.py b/tests/cli/test_rasa_shell.py
deleted file mode 100644
index 7301db203ec0..000000000000
--- a/tests/cli/test_rasa_shell.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_shell_help(run: Callable[..., RunResult]):
-    output = run("shell", "--help")
-
-    help_text = """usage: rasa shell [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
-                  [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-                  [--cors [CORS [CORS ...]]] [--enable-api]
-                  [--remote-storage REMOTE_STORAGE]
-                  [--ssl-certificate SSL_CERTIFICATE]
-                  [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
-                  [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
-                  [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
-                  [--jwt-method JWT_METHOD]
-                  {nlu} ... [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_shell_nlu_help(run: Callable[..., RunResult]):
-    output = run("shell", "nlu", "--help")
-
-    help_text = """usage: rasa shell nlu [-h] [-v] [-vv] [--quiet] [-m MODEL]
-                      [model-as-positional-argument]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
deleted file mode 100644
index 50d21c6e6978..000000000000
--- a/tests/cli/test_rasa_test.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import os
-from shutil import copyfile
-from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE
-from rasa.utils.io import list_files, write_yaml_file
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_test_core(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "core", "--stories", "data")
-
-    assert os.path.exists("results")
-
-
-def test_test_core_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "core", "--no-plot")
-
-    assert not os.path.exists("results/story_confmat.pdf")
-
-
-def test_test(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test")
-
-    assert os.path.exists("results")
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-
-
-def test_test_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "--no-plot")
-
-    assert not os.path.exists("results/hist.png")
-    assert not os.path.exists("results/confmat.png")
-    assert not os.path.exists("results/story_confmat.pdf")
-
-
-def test_test_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "nlu", "--nlu", "data", "--successes")
-
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-    assert os.path.exists("results/intent_successes.json")
-
-
-def test_test_nlu_no_plot(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project("test", "nlu", "--no-plot")
-
-    assert not os.path.exists("results/confmat.png")
-    assert not os.path.exists("results/hist.png")
-
-
-def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "test", "nlu", "--cross-validation", "-c", "config.yml", "-f", "2"
-    )
-
-    assert os.path.exists("results/hist.png")
-    assert os.path.exists("results/confmat.png")
-
-
-def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
-    copyfile("config.yml", "nlu-config.yml")
-
-    run_in_default_project(
-        "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2"
-    )
-
-    assert os.path.exists("results/run_1")
-    assert os.path.exists("results/run_2")
-
-
-def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
-    files = list_files("models")
-    copyfile(files[0], "models/copy-model.tar.gz")
-
-    run_in_default_project(
-        "test",
-        "core",
-        "-m",
-        files[0],
-        "models/copy-model.tar.gz",
-        "--stories",
-        "data/stories.md",
-    )
-
-    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
-
-
-def test_test_core_comparison_after_train(
-    run_in_default_project: Callable[..., RunResult]
-):
-    write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "KerasPolicy"}],
-        },
-        "config_1.yml",
-    )
-
-    write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "MemoizationPolicy"}],
-        },
-        "config_2.yml",
-    )
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config_1.yml",
-        "config_2.yml",
-        "--stories",
-        "data/stories.md",
-        "--runs",
-        "2",
-        "--percentages",
-        "25",
-        "75",
-        "--augmentation",
-        "5",
-        "--out",
-        "comparison_models",
-    )
-
-    assert os.path.exists("comparison_models")
-    assert os.path.exists("comparison_models/run_1")
-    assert os.path.exists("comparison_models/run_2")
-
-    run_in_default_project(
-        "test",
-        "core",
-        "-m",
-        "comparison_models",
-        "--stories",
-        "data/stories",
-        "--evaluate-model-directory",
-    )
-
-    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
-    assert os.path.exists(
-        os.path.join(DEFAULT_RESULTS_PATH, "core_model_comparison_graph.pdf")
-    )
-
-
-def test_test_help(run: Callable[..., RunResult]):
-    output = run("test", "--help")
-
-    help_text = """usage: rasa test [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
-                 [--max-stories MAX_STORIES] [--e2e] [--endpoints ENDPOINTS]
-                 [--fail-on-prediction-errors] [--url URL]
-                 [--evaluate-model-directory] [-u NLU] [--out OUT]
-                 [--successes] [--no-errors] [--histogram HISTOGRAM]
-                 [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
-                 [--cross-validation] [-f FOLDS] [-r RUNS]
-                 [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]
-                 {core,nlu} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_test_nlu_help(run: Callable[..., RunResult]):
-    output = run("test", "nlu", "--help")
-
-    help_text = """usage: rasa test nlu [-h] [-v] [-vv] [--quiet] [-m MODEL] [-u NLU] [--out OUT]
-                     [--successes] [--no-errors] [--histogram HISTOGRAM]
-                     [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
-                     [--cross-validation] [-f FOLDS] [-r RUNS]
-                     [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_test_core_help(run: Callable[..., RunResult]):
-    output = run("test", "core", "--help")
-
-    help_text = """usage: rasa test core [-h] [-v] [-vv] [--quiet] [-m MODEL [MODEL ...]]
-                      [-s STORIES] [--max-stories MAX_STORIES] [--out OUT]
-                      [--e2e] [--endpoints ENDPOINTS]
-                      [--fail-on-prediction-errors] [--url URL]
-                      [--evaluate-model-directory] [--no-plot]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py
deleted file mode 100644
index 1839c77b3ab9..000000000000
--- a/tests/cli/test_rasa_train.py
+++ /dev/null
@@ -1,464 +0,0 @@
-import os
-import shutil
-import tempfile
-
-import pytest
-from typing import Callable
-from _pytest.pytester import RunResult
-
-from rasa import model
-from rasa.nlu.model import Metadata
-from rasa.nlu.training_data import training_data
-from rasa.cli.train import _get_valid_config
-from rasa.constants import (
-    CONFIG_MANDATORY_KEYS_CORE,
-    CONFIG_MANDATORY_KEYS,
-    CONFIG_MANDATORY_KEYS_NLU,
-)
-import rasa.utils.io as io_utils
-
-
-def test_train(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models",
-        "--fixed-model-name",
-        "test-model",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "train_models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is None
-    assert not os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models",
-        "--fixed-model-name",
-        "test-model",
-        "--persist-nlu-data",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "train_models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is not None
-    assert os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
-    temp_dir = os.getcwd()
-
-    io_utils.write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "KerasPolicy"}],
-        },
-        "config_1.yml",
-    )
-
-    io_utils.write_yaml_file(
-        {
-            "language": "en",
-            "pipeline": "supervised_embeddings",
-            "policies": [{"name": "MemoizationPolicy"}],
-        },
-        "config_2.yml",
-    )
-
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config_1.yml",
-        "config_2.yml",
-        "--stories",
-        "data/stories.md",
-        "--out",
-        "core_comparison_results",
-        "--runs",
-        "2",
-        "--percentages",
-        "25",
-        "75",
-        "--augmentation",
-        "5",
-    )
-
-    assert os.path.exists(os.path.join(temp_dir, "core_comparison_results"))
-    run_directories = io_utils.list_subdirectories(
-        os.path.join(temp_dir, "core_comparison_results")
-    )
-    assert len(run_directories) == 2
-    model_files = io_utils.list_files(
-        os.path.join(temp_dir, "core_comparison_results", run_directories[0])
-    )
-    assert len(model_files) == 4
-    assert model_files[0].endswith("tar.gz")
-
-
-def test_train_no_domain_exists(
-    run_in_default_project: Callable[..., RunResult]
-) -> None:
-
-    os.remove("domain.yml")
-    run_in_default_project(
-        "train",
-        "-c",
-        "config.yml",
-        "--data",
-        "data",
-        "--out",
-        "train_models_no_domain",
-        "--fixed-model-name",
-        "nlu-model-only",
-    )
-
-    assert os.path.exists("train_models_no_domain")
-    files = io_utils.list_files("train_models_no_domain")
-    assert len(files) == 1
-
-    trained_model_path = "train_models_no_domain/nlu-model-only.tar.gz"
-    unpacked = model.unpack_model(trained_model_path)
-
-    metadata_path = os.path.join(unpacked, "nlu", "metadata.json")
-    assert os.path.exists(metadata_path)
-
-
-def test_train_skip_on_model_not_changed(
-    run_in_default_project: Callable[..., RunResult]
-):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-
-    file_name = files[0]
-    run_in_default_project("train")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert file_name == files[0]
-
-
-def test_train_force(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-
-    run_in_default_project("train", "--force")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 2
-
-
-def test_train_with_only_nlu_data(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "data/stories.md"))
-    os.remove(os.path.join(temp_dir, "data/stories.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
-
-    run_in_default_project("train", "--fixed-model-name", "test-model")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-
-
-def test_train_with_only_core_data(run_in_default_project):
-    temp_dir = os.getcwd()
-
-    assert os.path.exists(os.path.join(temp_dir, "data/nlu.md"))
-    os.remove(os.path.join(temp_dir, "data/nlu.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
-
-    run_in_default_project("train", "--fixed-model-name", "test-model")
-
-    assert os.path.exists(os.path.join(temp_dir, "models"))
-    files = io_utils.list_files(os.path.join(temp_dir, "models"))
-    assert len(files) == 1
-    assert os.path.basename(files[0]) == "test-model.tar.gz"
-
-
-def test_train_core(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "train",
-        "core",
-        "-c",
-        "config.yml",
-        "-d",
-        "domain.yml",
-        "--stories",
-        "data",
-        "--out",
-        "train_rasa_models",
-        "--fixed-model-name",
-        "rasa-model",
-    )
-
-    assert os.path.exists("train_rasa_models/rasa-model.tar.gz")
-    assert os.path.isfile("train_rasa_models/rasa-model.tar.gz")
-
-
-def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]):
-
-    os.remove("domain.yml")
-    run_in_default_project(
-        "train",
-        "core",
-        "--config",
-        "config.yml",
-        "--domain",
-        "domain1.yml",
-        "--stories",
-        "data",
-        "--out",
-        "train_rasa_models_no_domain",
-        "--fixed-model-name",
-        "rasa-model",
-    )
-
-    assert not os.path.exists("train_rasa_models_no_domain/rasa-model.tar.gz")
-    assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz")
-
-
-def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
-        "train",
-        "nlu",
-        "-c",
-        "config.yml",
-        "--nlu",
-        "data/nlu.md",
-        "--out",
-        "train_models",
-    )
-
-    assert os.path.exists("train_models")
-    files = io_utils.list_files("train_models")
-    assert len(files) == 1
-    assert os.path.basename(files[0]).startswith("nlu-")
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is None
-    assert not os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_nlu_persist_nlu_data(
-    run_in_default_project: Callable[..., RunResult]
-) -> None:
-    run_in_default_project(
-        "train",
-        "nlu",
-        "-c",
-        "config.yml",
-        "--nlu",
-        "data/nlu.md",
-        "--out",
-        "train_models",
-        "--persist-nlu-data",
-    )
-
-    assert os.path.exists("train_models")
-    files = io_utils.list_files("train_models")
-    assert len(files) == 1
-    assert os.path.basename(files[0]).startswith("nlu-")
-    model_dir = model.get_model("train_models")
-    assert model_dir is not None
-    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
-    assert metadata.get("training_data") is not None
-    assert os.path.exists(
-        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
-    )
-
-
-def test_train_help(run):
-    output = run("train", "--help")
-
-    help_text = """usage: rasa train [-h] [-v] [-vv] [--quiet] [--data DATA [DATA ...]]
-                  [-c CONFIG] [-d DOMAIN] [--out OUT]
-                  [--augmentation AUGMENTATION] [--debug-plots]
-                  [--dump-stories] [--fixed-model-name FIXED_MODEL_NAME]
-                  [--persist-nlu-data] [--force]
-                  {core,nlu} ..."""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_train_nlu_help(run: Callable[..., RunResult]):
-    output = run("train", "nlu", "--help")
-
-    help_text = """usage: rasa train nlu [-h] [-v] [-vv] [--quiet] [-c CONFIG] [--out OUT]
-                      [-u NLU] [--fixed-model-name FIXED_MODEL_NAME]
-                      [--persist-nlu-data]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_train_core_help(run: Callable[..., RunResult]):
-    output = run("train", "core", "--help")
-
-    help_text = """usage: rasa train core [-h] [-v] [-vv] [--quiet] [-s STORIES] [-d DOMAIN]
-                       [-c CONFIG [CONFIG ...]] [--out OUT]
-                       [--augmentation AUGMENTATION] [--debug-plots]
-                       [--dump-stories] [--force]
-                       [--fixed-model-name FIXED_MODEL_NAME]
-                       [--percentages [PERCENTAGES [PERCENTAGES ...]]]
-                       [--runs RUNS]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-@pytest.mark.parametrize(
-    "parameters",
-    [
-        {
-            "config_data": {"language": "en", "pipeline": "supervised"},
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
-            "error": True,
-        },
-        {
-            "config_data": {},
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": True,
-        },
-        {
-            "config_data": {
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-                "imports": "other-folder",
-            },
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": {
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": {
-                "language": "en",
-                "pipeline": "supervised",
-                "policies": ["KerasPolicy", "FallbackPolicy"],
-            },
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": False,
-        },
-        {
-            "config_data": None,
-            "default_config": {"language": "en", "pipeline": "supervised"},
-            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
-            "error": True,
-        },
-        {
-            "config_data": None,
-            "default_config": None,
-            "mandatory_keys": CONFIG_MANDATORY_KEYS,
-            "error": True,
-        },
-    ],
-)
-def test_get_valid_config(parameters):
-    import rasa.utils.io
-
-    config_path = None
-    if parameters["config_data"] is not None:
-        config_path = os.path.join(tempfile.mkdtemp(), "config.yml")
-        rasa.utils.io.write_yaml_file(parameters["config_data"], config_path)
-
-    default_config_path = None
-    if parameters["default_config"] is not None:
-        default_config_path = os.path.join(tempfile.mkdtemp(), "default-config.yml")
-        rasa.utils.io.write_yaml_file(parameters["default_config"], default_config_path)
-
-    if parameters["error"]:
-        with pytest.raises(SystemExit):
-            _get_valid_config(config_path, parameters["mandatory_keys"])
-
-    else:
-        config_path = _get_valid_config(
-            config_path, parameters["mandatory_keys"], default_config_path
-        )
-
-        config_data = rasa.utils.io.read_yaml_file(config_path)
-
-        for k in parameters["mandatory_keys"]:
-            assert k in config_data
-
-
-def test_get_valid_config_with_non_existing_file():
-    with pytest.raises(SystemExit):
-        _get_valid_config("non-existing-file.yml", CONFIG_MANDATORY_KEYS)
diff --git a/tests/cli/test_rasa_visualize.py b/tests/cli/test_rasa_visualize.py
deleted file mode 100644
index f69115c9529b..000000000000
--- a/tests/cli/test_rasa_visualize.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from typing import Callable
-from _pytest.pytester import RunResult
-
-
-def test_visualize_help(run: Callable[..., RunResult]):
-    output = run("visualize", "--help")
-
-    help_text = """usage: rasa visualize [-h] [-v] [-vv] [--quiet] [-d DOMAIN] [-s STORIES]
-                      [-c CONFIG] [--out OUT] [--max-history MAX_HISTORY]
-                      [-u NLU]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_x.py b/tests/cli/test_rasa_x.py
deleted file mode 100644
index 656ec082b8a4..000000000000
--- a/tests/cli/test_rasa_x.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from pathlib import Path
-
-import pytest
-from typing import Callable, Dict
-from _pytest.pytester import RunResult
-
-
-from aioresponses import aioresponses
-
-import rasa.utils.io as io_utils
-from rasa.cli import x
-from rasa.utils.endpoints import EndpointConfig
-from rasa.core.utils import AvailableEndpoints
-
-
-def test_x_help(run: Callable[..., RunResult]):
-    output = run("x", "--help")
-
-    help_text = """usage: rasa x [-h] [-v] [-vv] [--quiet] [-m MODEL] [--data DATA] [-c CONFIG]
-              [--no-prompt] [--production] [--rasa-x-port RASA_X_PORT]
-              [--config-endpoint CONFIG_ENDPOINT] [--log-file LOG_FILE]
-              [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
-              [--cors [CORS [CORS ...]]] [--enable-api]
-              [--remote-storage REMOTE_STORAGE]
-              [--ssl-certificate SSL_CERTIFICATE] [--ssl-keyfile SSL_KEYFILE]
-              [--ssl-ca-file SSL_CA_FILE] [--ssl-password SSL_PASSWORD]
-              [--credentials CREDENTIALS] [--connector CONNECTOR]
-              [--jwt-secret JWT_SECRET] [--jwt-method JWT_METHOD]"""
-
-    lines = help_text.split("\n")
-
-    for i, line in enumerate(lines):
-        assert output.outlines[i] == line
-
-
-def test_prepare_credentials_for_rasa_x_if_rasa_channel_not_given(tmpdir: Path):
-    credentials_path = str(tmpdir / "credentials.yml")
-
-    io_utils.write_yaml_file({}, credentials_path)
-
-    tmp_credentials = x._prepare_credentials_for_rasa_x(
-        credentials_path, "http://localhost:5002"
-    )
-
-    actual = io_utils.read_config_file(tmp_credentials)
-
-    assert actual["rasa"]["url"] == "http://localhost:5002"
-
-
-def test_prepare_credentials_if_already_valid(tmpdir: Path):
-    credentials_path = str(tmpdir / "credentials.yml")
-
-    credentials = {
-        "rasa": {"url": "my-custom-url"},
-        "another-channel": {"url": "some-url"},
-    }
-    io_utils.write_yaml_file(credentials, credentials_path)
-
-    x._prepare_credentials_for_rasa_x(credentials_path)
-
-    actual = io_utils.read_config_file(credentials_path)
-
-    assert actual == credentials
-
-
-def test_if_default_endpoint_config_is_valid_in_local_mode():
-    event_broker_endpoint = x._get_event_broker_endpoint(None)
-
-    assert x._is_correct_event_broker(event_broker_endpoint)
-
-
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {"type": "mongo", "url": "mongodb://localhost:27017"},
-        {"type": "sql", "dialect": "postgresql"},
-        {"type": "sql", "dialect": "sqlite", "db": "some.db"},
-    ],
-)
-def test_if_endpoint_config_is_invalid_in_local_mode(kwargs: Dict):
-    config = EndpointConfig(**kwargs)
-    assert not x._is_correct_event_broker(config)
-
-
-def test_overwrite_model_server_url():
-    endpoint_config = EndpointConfig(url="http://testserver:5002/models/default@latest")
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-def test_overwrite_model_server_url_with_no_model_endpoint():
-    endpoints = AvailableEndpoints()
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-def test_reuse_wait_time_between_pulls():
-    test_wait_time = 5
-    endpoint_config = EndpointConfig(
-        url="http://localhost:5002/models/default@latest",
-        wait_time_between_pulls=test_wait_time,
-    )
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    assert endpoints.model.kwargs["wait_time_between_pulls"] == test_wait_time
-
-
-def test_default_wait_time_between_pulls():
-    endpoint_config = EndpointConfig(url="http://localhost:5002/models/default@latest")
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert endpoints.model.kwargs["wait_time_between_pulls"] == 2
-
-
-def test_default_model_server_url():
-    endpoint_config = EndpointConfig()
-    endpoints = AvailableEndpoints(model=endpoint_config)
-    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
-    assert (
-        endpoints.model.url
-        == "http://localhost/projects/default/models/tags/production"
-    )
-
-
-async def test_pull_runtime_config_from_server():
-    config_url = "http://example.com/api/config?token=token"
-    credentials = "rasa: http://example.com:5002/api"
-    endpoint_config = """
-    event_broker:
-        url: http://example.com/event_broker
-        username: some_username
-        password: PASSWORD
-        queue: broker_queue
-    """
-    with aioresponses() as mocked:
-        mocked.get(
-            config_url,
-            payload={"credentials": credentials, "endpoints": endpoint_config},
-        )
-
-        endpoints_path, credentials_path = await x._pull_runtime_config_from_server(
-            config_url, 1, 0
-        )
-
-        assert io_utils.read_file(endpoints_path) == endpoint_config
-        assert io_utils.read_file(credentials_path) == credentials
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
deleted file mode 100644
index 4d270be3964b..000000000000
--- a/tests/cli/test_utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import contextlib
-import logging
-import os
-import pathlib
-import sys
-import tempfile
-
-import pytest
-from _pytest.logging import LogCaptureFixture
-
-import rasa.cli.utils
-from rasa.cli.utils import (
-    parse_last_positional_argument_as_model_path,
-    get_validated_path,
-)
-
-
-@contextlib.contextmanager
-def make_actions_subdir():
-    """Create a subdir called actions to test model argument handling."""
-    with tempfile.TemporaryDirectory() as tempdir:
-        cwd = os.getcwd()
-        os.chdir(tempdir)
-        try:
-            (pathlib.Path(tempdir) / "actions").mkdir()
-            yield
-        finally:
-            os.chdir(cwd)
-
-
-@pytest.mark.parametrize(
-    "argv",
-    [
-        ["rasa", "run"],
-        ["rasa", "run", "actions"],
-        ["rasa", "run", "core"],
-        ["rasa", "interactive", "nlu", "--param", "xy"],
-    ],
-)
-def test_parse_last_positional_argument_as_model_path(argv):
-    with make_actions_subdir():
-        test_model_dir = tempfile.gettempdir()
-        argv.append(test_model_dir)
-
-        sys.argv = argv.copy()
-        parse_last_positional_argument_as_model_path()
-
-        assert sys.argv[-2] == "--model"
-        assert sys.argv[-1] == test_model_dir
-
-
-@pytest.mark.parametrize(
-    "argv",
-    [
-        ["rasa", "run"],
-        ["rasa", "run", "actions"],
-        ["rasa", "run", "core"],
-        ["rasa", "test", "nlu", "--param", "xy", "--model", "test"],
-    ],
-)
-def test_parse_no_positional_model_path_argument(argv):
-    with make_actions_subdir():
-        sys.argv = argv.copy()
-
-        parse_last_positional_argument_as_model_path()
-
-        assert sys.argv == argv
-
-
-def test_validate_invalid_path():
-    with pytest.raises(SystemExit):
-        get_validated_path("test test test", "out", "default")
-
-
-def test_validate_valid_path():
-    tempdir = tempfile.mkdtemp()
-
-    assert get_validated_path(tempdir, "out", "default") == tempdir
-
-
-def test_validate_if_none_is_valid():
-    assert get_validated_path(None, "out", "default", True) is None
-
-
-def test_validate_with_none_if_default_is_valid(caplog: LogCaptureFixture):
-    tempdir = tempfile.mkdtemp()
-
-    with caplog.at_level(logging.WARNING, rasa.cli.utils.logger.name):
-        assert get_validated_path(None, "out", tempdir) == tempdir
-
-    assert caplog.records == []
-
-
-def test_validate_with_invalid_directory_if_default_is_valid(caplog: LogCaptureFixture):
-    tempdir = tempfile.mkdtemp()
-    invalid_directory = "gcfhvjkb"
-    with pytest.warns(UserWarning) as record:
-        assert get_validated_path(invalid_directory, "out", tempdir) == tempdir
-    assert len(record) == 1
-    assert "does not seem to exist" in record[0].message.args[0]
-
-
-def test_print_error_and_exit():
-    with pytest.raises(SystemExit):
-        rasa.cli.utils.print_error_and_exit("")

From 1fd5cb69662dec40b0dae8ec4b598bbf99669db6 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 22:15:18 +0100
Subject: [PATCH 355/633] return cli tests

---
 tests/cli/__init__.py              |   0
 tests/cli/conftest.py              |  34 +++
 tests/cli/test_cli.py              |  37 +++
 tests/cli/test_rasa_data.py        |  78 +++++
 tests/cli/test_rasa_init.py        |  53 ++++
 tests/cli/test_rasa_interactive.py | 158 ++++++++++
 tests/cli/test_rasa_run.py         |  46 +++
 tests/cli/test_rasa_shell.py       |  34 +++
 tests/cli/test_rasa_test.py        | 194 ++++++++++++
 tests/cli/test_rasa_train.py       | 464 +++++++++++++++++++++++++++++
 tests/cli/test_rasa_visualize.py   |  15 +
 tests/cli/test_rasa_x.py           | 152 ++++++++++
 tests/cli/test_utils.py            | 105 +++++++
 13 files changed, 1370 insertions(+)
 create mode 100644 tests/cli/__init__.py
 create mode 100644 tests/cli/conftest.py
 create mode 100644 tests/cli/test_cli.py
 create mode 100644 tests/cli/test_rasa_data.py
 create mode 100644 tests/cli/test_rasa_init.py
 create mode 100644 tests/cli/test_rasa_interactive.py
 create mode 100644 tests/cli/test_rasa_run.py
 create mode 100644 tests/cli/test_rasa_shell.py
 create mode 100644 tests/cli/test_rasa_test.py
 create mode 100644 tests/cli/test_rasa_train.py
 create mode 100644 tests/cli/test_rasa_visualize.py
 create mode 100644 tests/cli/test_rasa_x.py
 create mode 100644 tests/cli/test_utils.py

diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
new file mode 100644
index 000000000000..b7294eaeadff
--- /dev/null
+++ b/tests/cli/conftest.py
@@ -0,0 +1,34 @@
+from typing import Callable
+import pytest
+import os
+from _pytest.pytester import Testdir, RunResult
+
+
+@pytest.fixture
+def run(testdir: Testdir) -> Callable[..., RunResult]:
+    def do_run(*args):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args)
+
+    return do_run
+
+
+@pytest.fixture
+def run_with_stdin(testdir: Testdir) -> Callable[..., RunResult]:
+    def do_run(*args, stdin):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args, stdin=stdin)
+
+    return do_run
+
+
+@pytest.fixture
+def run_in_default_project(testdir: Testdir) -> Callable[..., RunResult]:
+    os.environ["LOG_LEVEL"] = "ERROR"
+    testdir.run("rasa", "init", "--no-prompt")
+
+    def do_run(*args):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args)
+
+    return do_run
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
new file mode 100644
index 000000000000..dc221349b647
--- /dev/null
+++ b/tests/cli/test_cli.py
@@ -0,0 +1,37 @@
+import pytest
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_cli_start(run: Callable[..., RunResult]):
+    """
+    Measures an average startup time and checks that it
+    does not deviate more than x seconds from 5.
+    """
+    import time
+
+    durations = []
+
+    for i in range(5):
+        start = time.time()
+        run("--help")
+        end = time.time()
+
+        durations.append(end - start)
+
+    avg_duration = sum(durations) / len(durations)
+
+    # When run in parallel, it takes a little longer
+    assert avg_duration - 5 <= 2
+
+
+def test_data_convert_help(run: Callable[..., RunResult]):
+    output = run("--help")
+
+    help_text = """usage: rasa [-h] [--version]
+            {init,run,shell,train,interactive,test,visualize,data,x} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py
new file mode 100644
index 000000000000..3021e9ab12e7
--- /dev/null
+++ b/tests/cli/test_rasa_data.py
@@ -0,0 +1,78 @@
+import os
+import pytest
+from collections import namedtuple
+from typing import Callable
+from _pytest.pytester import RunResult
+from rasa.cli import data
+
+
+def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75"
+    )
+
+    assert os.path.exists("train_test_split")
+    assert os.path.exists(os.path.join("train_test_split", "test_data.md"))
+    assert os.path.exists(os.path.join("train_test_split", "training_data.md"))
+
+
+def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "data",
+        "convert",
+        "nlu",
+        "--data",
+        "data/nlu.md",
+        "--out",
+        "out_nlu_data.json",
+        "-f",
+        "json",
+    )
+
+    assert os.path.exists("out_nlu_data.json")
+
+
+def test_data_split_help(run: Callable[..., RunResult]):
+    output = run("data", "split", "nlu", "--help")
+
+    help_text = """usage: rasa data split nlu [-h] [-v] [-vv] [--quiet] [-u NLU]
+                           [--training-fraction TRAINING_FRACTION]
+                           [--random-seed RANDOM_SEED] [--out OUT]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_data_convert_help(run: Callable[..., RunResult]):
+    output = run("data", "convert", "nlu", "--help")
+
+    help_text = """usage: rasa data convert nlu [-h] [-v] [-vv] [--quiet] --data DATA --out OUT
+                             [-l LANGUAGE] -f {json,md}"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_data_validate_help(run: Callable[..., RunResult]):
+    output = run("data", "validate", "--help")
+
+    help_text = """usage: rasa data validate [-h] [-v] [-vv] [--quiet] [--fail-on-warnings]
+                          [-d DOMAIN] [--data DATA]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_validate_files_exit_early():
+    with pytest.raises(SystemExit) as pytest_e:
+        args = {"domain": "data/test_domains/duplicate_intents.yml", "data": None}
+        data.validate_files(namedtuple("Args", args.keys())(*args.values()))
+
+    assert pytest_e.type == SystemExit
+    assert pytest_e.value.code == 1
diff --git a/tests/cli/test_rasa_init.py b/tests/cli/test_rasa_init.py
new file mode 100644
index 000000000000..c23ba563dda5
--- /dev/null
+++ b/tests/cli/test_rasa_init.py
@@ -0,0 +1,53 @@
+import os
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_init(run: Callable[..., RunResult]):
+    run("init", "--no-prompt", "--quiet")
+
+    assert os.path.exists("actions.py")
+    assert os.path.exists("domain.yml")
+    assert os.path.exists("config.yml")
+    assert os.path.exists("credentials.yml")
+    assert os.path.exists("endpoints.yml")
+    assert os.path.exists("models")
+    assert os.path.exists("data/nlu.md")
+    assert os.path.exists("data/stories.md")
+
+
+def test_init_using_init_dir_option(run: Callable[..., RunResult]):
+    os.makedirs("./workspace")
+    run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
+
+    assert os.path.exists("./workspace/actions.py")
+    assert os.path.exists("./workspace/domain.yml")
+    assert os.path.exists("./workspace/config.yml")
+    assert os.path.exists("./workspace/credentials.yml")
+    assert os.path.exists("./workspace/endpoints.yml")
+    assert os.path.exists("./workspace/models")
+    assert os.path.exists("./workspace/data/nlu.md")
+    assert os.path.exists("./workspace/data/stories.md")
+
+
+def test_not_fount_init_path(run: Callable[..., RunResult]):
+    output = run("init", "--no-prompt", "--quiet", "--init-dir", "./workspace")
+
+    assert (
+        output.outlines[-1]
+        == "\033[91mProject init path './workspace' not found.\033[0m"
+    )
+
+
+def test_init_help(run: Callable[..., RunResult]):
+    output = run("init", "--help")
+
+    assert (
+        output.outlines[0]
+        == "usage: rasa init [-h] [-v] [-vv] [--quiet] [--no-prompt] [--init-dir INIT_DIR]"
+    )
+
+
+def test_user_asked_to_train_model(run_with_stdin: Callable[..., RunResult]):
+    run_with_stdin("init", stdin=b"\nYN")
+    assert not os.path.exists("models")
diff --git a/tests/cli/test_rasa_interactive.py b/tests/cli/test_rasa_interactive.py
new file mode 100644
index 000000000000..eabc315d1089
--- /dev/null
+++ b/tests/cli/test_rasa_interactive.py
@@ -0,0 +1,158 @@
+import argparse
+import pytest
+from typing import Callable, Text
+from unittest.mock import Mock
+
+from _pytest.monkeypatch import MonkeyPatch
+from _pytest.pytester import RunResult
+
+import rasa
+from rasa.cli import interactive, train
+
+
+def test_interactive_help(run: Callable[..., RunResult]):
+    output = run("interactive", "--help")
+
+    help_text = """usage: rasa interactive [-h] [-v] [-vv] [--quiet] [--e2e] [-m MODEL]
+                        [--data DATA [DATA ...]] [--skip-visualization]
+                        [--endpoints ENDPOINTS] [-c CONFIG] [-d DOMAIN]
+                        [--out OUT] [--augmentation AUGMENTATION]
+                        [--debug-plots] [--dump-stories] [--force]
+                        [--persist-nlu-data]
+                        {core} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_interactive_core_help(run: Callable[..., RunResult]):
+    output = run("interactive", "core", "--help")
+
+    help_text = """usage: rasa interactive core [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
+                             [--skip-visualization] [--endpoints ENDPOINTS]
+                             [-c CONFIG] [-d DOMAIN] [--out OUT]
+                             [--augmentation AUGMENTATION] [--debug-plots]
+                             [--dump-stories]
+                             [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_pass_arguments_to_rasa_train(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    # Create parser
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    # Parse interactive command
+    args = parser.parse_args(["interactive", "--config", default_stack_config])
+    interactive._set_not_required_args(args)
+
+    # Mock actual training
+    mock = Mock()
+    monkeypatch.setattr(rasa, "train", mock.method)
+
+    # If the `Namespace` object does not have all required fields this will throw
+    train.train(args)
+
+    # Assert `train` was actually called
+    mock.method.assert_called_once()
+
+
+def test_train_called_when_no_model_passed(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "--config",
+            default_stack_config,
+            "--data",
+            "examples/moodbot/data",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    # Mock actual training and interactive learning methods
+    mock = Mock()
+    monkeypatch.setattr(train, "train", mock.train_model)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    interactive.interactive(args)
+    mock.train_model.assert_called_once()
+
+
+def test_train_core_called_when_no_model_passed_and_core(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "core",
+            "--config",
+            default_stack_config,
+            "--stories",
+            "examples/moodbot/data/stories.md",
+            "--domain",
+            "examples/moodbot/domain.yml",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    # Mock actual training and interactive learning methods
+    mock = Mock()
+    monkeypatch.setattr(train, "train_core", mock.train_core)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    interactive.interactive(args)
+    mock.train_core.assert_called_once()
+
+
+def test_no_interactive_without_core_data(
+    default_stack_config: Text, monkeypatch: MonkeyPatch
+) -> None:
+    parser = argparse.ArgumentParser()
+    sub_parser = parser.add_subparsers()
+    interactive.add_subparser(sub_parser, [])
+
+    args = parser.parse_args(
+        [
+            "interactive",
+            "--config",
+            default_stack_config,
+            "--data",
+            "examples/moodbot/data/nlu.md",
+        ]
+    )
+    interactive._set_not_required_args(args)
+
+    mock = Mock()
+    monkeypatch.setattr(train, "train", mock.train_model)
+    monkeypatch.setattr(
+        interactive, "perform_interactive_learning", mock.perform_interactive_learning
+    )
+
+    with pytest.raises(SystemExit):
+        interactive.interactive(args)
+
+    mock.train_model.assert_not_called()
+    mock.perform_interactive_learning.assert_not_called()
diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py
new file mode 100644
index 000000000000..8de8685e91dd
--- /dev/null
+++ b/tests/cli/test_rasa_run.py
@@ -0,0 +1,46 @@
+import os
+import shutil
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]):
+    os.remove("domain.yml")
+    shutil.rmtree("models")
+
+    # the server should not start as no model is configured
+    output = run_in_default_project("run")
+
+    assert "No model found." in output.outlines[0]
+
+
+def test_run_help(run: Callable[..., RunResult]):
+    output = run("run", "--help")
+
+    help_text = """usage: rasa run [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
+                [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+                [--cors [CORS [CORS ...]]] [--enable-api]
+                [--remote-storage REMOTE_STORAGE]
+                [--ssl-certificate SSL_CERTIFICATE]
+                [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
+                [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
+                [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
+                [--jwt-method JWT_METHOD]
+                {actions} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_run_action_help(run: Callable[..., RunResult]):
+    output = run("run", "actions", "--help")
+
+    help_text = """usage: rasa run actions [-h] [-v] [-vv] [--quiet] [-p PORT]
+                        [--cors [CORS [CORS ...]]] [--actions ACTIONS]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_shell.py b/tests/cli/test_rasa_shell.py
new file mode 100644
index 000000000000..7301db203ec0
--- /dev/null
+++ b/tests/cli/test_rasa_shell.py
@@ -0,0 +1,34 @@
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_shell_help(run: Callable[..., RunResult]):
+    output = run("shell", "--help")
+
+    help_text = """usage: rasa shell [-h] [-v] [-vv] [--quiet] [-m MODEL] [--log-file LOG_FILE]
+                  [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+                  [--cors [CORS [CORS ...]]] [--enable-api]
+                  [--remote-storage REMOTE_STORAGE]
+                  [--ssl-certificate SSL_CERTIFICATE]
+                  [--ssl-keyfile SSL_KEYFILE] [--ssl-ca-file SSL_CA_FILE]
+                  [--ssl-password SSL_PASSWORD] [--credentials CREDENTIALS]
+                  [--connector CONNECTOR] [--jwt-secret JWT_SECRET]
+                  [--jwt-method JWT_METHOD]
+                  {nlu} ... [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_shell_nlu_help(run: Callable[..., RunResult]):
+    output = run("shell", "nlu", "--help")
+
+    help_text = """usage: rasa shell nlu [-h] [-v] [-vv] [--quiet] [-m MODEL]
+                      [model-as-positional-argument]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
new file mode 100644
index 000000000000..50d21c6e6978
--- /dev/null
+++ b/tests/cli/test_rasa_test.py
@@ -0,0 +1,194 @@
+import os
+from shutil import copyfile
+from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE
+from rasa.utils.io import list_files, write_yaml_file
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_test_core(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "core", "--stories", "data")
+
+    assert os.path.exists("results")
+
+
+def test_test_core_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "core", "--no-plot")
+
+    assert not os.path.exists("results/story_confmat.pdf")
+
+
+def test_test(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test")
+
+    assert os.path.exists("results")
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+
+
+def test_test_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "--no-plot")
+
+    assert not os.path.exists("results/hist.png")
+    assert not os.path.exists("results/confmat.png")
+    assert not os.path.exists("results/story_confmat.pdf")
+
+
+def test_test_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "nlu", "--nlu", "data", "--successes")
+
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+    assert os.path.exists("results/intent_successes.json")
+
+
+def test_test_nlu_no_plot(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project("test", "nlu", "--no-plot")
+
+    assert not os.path.exists("results/confmat.png")
+    assert not os.path.exists("results/hist.png")
+
+
+def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "test", "nlu", "--cross-validation", "-c", "config.yml", "-f", "2"
+    )
+
+    assert os.path.exists("results/hist.png")
+    assert os.path.exists("results/confmat.png")
+
+
+def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
+    copyfile("config.yml", "nlu-config.yml")
+
+    run_in_default_project(
+        "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2"
+    )
+
+    assert os.path.exists("results/run_1")
+    assert os.path.exists("results/run_2")
+
+
+def test_test_core_comparison(run_in_default_project: Callable[..., RunResult]):
+    files = list_files("models")
+    copyfile(files[0], "models/copy-model.tar.gz")
+
+    run_in_default_project(
+        "test",
+        "core",
+        "-m",
+        files[0],
+        "models/copy-model.tar.gz",
+        "--stories",
+        "data/stories.md",
+    )
+
+    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
+
+
+def test_test_core_comparison_after_train(
+    run_in_default_project: Callable[..., RunResult]
+):
+    write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "KerasPolicy"}],
+        },
+        "config_1.yml",
+    )
+
+    write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "MemoizationPolicy"}],
+        },
+        "config_2.yml",
+    )
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config_1.yml",
+        "config_2.yml",
+        "--stories",
+        "data/stories.md",
+        "--runs",
+        "2",
+        "--percentages",
+        "25",
+        "75",
+        "--augmentation",
+        "5",
+        "--out",
+        "comparison_models",
+    )
+
+    assert os.path.exists("comparison_models")
+    assert os.path.exists("comparison_models/run_1")
+    assert os.path.exists("comparison_models/run_2")
+
+    run_in_default_project(
+        "test",
+        "core",
+        "-m",
+        "comparison_models",
+        "--stories",
+        "data/stories",
+        "--evaluate-model-directory",
+    )
+
+    assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE))
+    assert os.path.exists(
+        os.path.join(DEFAULT_RESULTS_PATH, "core_model_comparison_graph.pdf")
+    )
+
+
+def test_test_help(run: Callable[..., RunResult]):
+    output = run("test", "--help")
+
+    help_text = """usage: rasa test [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES]
+                 [--max-stories MAX_STORIES] [--e2e] [--endpoints ENDPOINTS]
+                 [--fail-on-prediction-errors] [--url URL]
+                 [--evaluate-model-directory] [-u NLU] [--out OUT]
+                 [--successes] [--no-errors] [--histogram HISTOGRAM]
+                 [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
+                 [--cross-validation] [-f FOLDS] [-r RUNS]
+                 [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]
+                 {core,nlu} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_test_nlu_help(run: Callable[..., RunResult]):
+    output = run("test", "nlu", "--help")
+
+    help_text = """usage: rasa test nlu [-h] [-v] [-vv] [--quiet] [-m MODEL] [-u NLU] [--out OUT]
+                     [--successes] [--no-errors] [--histogram HISTOGRAM]
+                     [--confmat CONFMAT] [-c CONFIG [CONFIG ...]]
+                     [--cross-validation] [-f FOLDS] [-r RUNS]
+                     [-p PERCENTAGES [PERCENTAGES ...]] [--no-plot]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_test_core_help(run: Callable[..., RunResult]):
+    output = run("test", "core", "--help")
+
+    help_text = """usage: rasa test core [-h] [-v] [-vv] [--quiet] [-m MODEL [MODEL ...]]
+                      [-s STORIES] [--max-stories MAX_STORIES] [--out OUT]
+                      [--e2e] [--endpoints ENDPOINTS]
+                      [--fail-on-prediction-errors] [--url URL]
+                      [--evaluate-model-directory] [--no-plot]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py
new file mode 100644
index 000000000000..1839c77b3ab9
--- /dev/null
+++ b/tests/cli/test_rasa_train.py
@@ -0,0 +1,464 @@
+import os
+import shutil
+import tempfile
+
+import pytest
+from typing import Callable
+from _pytest.pytester import RunResult
+
+from rasa import model
+from rasa.nlu.model import Metadata
+from rasa.nlu.training_data import training_data
+from rasa.cli.train import _get_valid_config
+from rasa.constants import (
+    CONFIG_MANDATORY_KEYS_CORE,
+    CONFIG_MANDATORY_KEYS,
+    CONFIG_MANDATORY_KEYS_NLU,
+)
+import rasa.utils.io as io_utils
+
+
+def test_train(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models",
+        "--fixed-model-name",
+        "test-model",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "train_models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is None
+    assert not os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models",
+        "--fixed-model-name",
+        "test-model",
+        "--persist-nlu-data",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "train_models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "train_models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is not None
+    assert os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
+    temp_dir = os.getcwd()
+
+    io_utils.write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "KerasPolicy"}],
+        },
+        "config_1.yml",
+    )
+
+    io_utils.write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": "supervised_embeddings",
+            "policies": [{"name": "MemoizationPolicy"}],
+        },
+        "config_2.yml",
+    )
+
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config_1.yml",
+        "config_2.yml",
+        "--stories",
+        "data/stories.md",
+        "--out",
+        "core_comparison_results",
+        "--runs",
+        "2",
+        "--percentages",
+        "25",
+        "75",
+        "--augmentation",
+        "5",
+    )
+
+    assert os.path.exists(os.path.join(temp_dir, "core_comparison_results"))
+    run_directories = io_utils.list_subdirectories(
+        os.path.join(temp_dir, "core_comparison_results")
+    )
+    assert len(run_directories) == 2
+    model_files = io_utils.list_files(
+        os.path.join(temp_dir, "core_comparison_results", run_directories[0])
+    )
+    assert len(model_files) == 4
+    assert model_files[0].endswith("tar.gz")
+
+
+def test_train_no_domain_exists(
+    run_in_default_project: Callable[..., RunResult]
+) -> None:
+
+    os.remove("domain.yml")
+    run_in_default_project(
+        "train",
+        "-c",
+        "config.yml",
+        "--data",
+        "data",
+        "--out",
+        "train_models_no_domain",
+        "--fixed-model-name",
+        "nlu-model-only",
+    )
+
+    assert os.path.exists("train_models_no_domain")
+    files = io_utils.list_files("train_models_no_domain")
+    assert len(files) == 1
+
+    trained_model_path = "train_models_no_domain/nlu-model-only.tar.gz"
+    unpacked = model.unpack_model(trained_model_path)
+
+    metadata_path = os.path.join(unpacked, "nlu", "metadata.json")
+    assert os.path.exists(metadata_path)
+
+
+def test_train_skip_on_model_not_changed(
+    run_in_default_project: Callable[..., RunResult]
+):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+
+    file_name = files[0]
+    run_in_default_project("train")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert file_name == files[0]
+
+
+def test_train_force(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+
+    run_in_default_project("train", "--force")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 2
+
+
+def test_train_with_only_nlu_data(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "data/stories.md"))
+    os.remove(os.path.join(temp_dir, "data/stories.md"))
+    shutil.rmtree(os.path.join(temp_dir, "models"))
+
+    run_in_default_project("train", "--fixed-model-name", "test-model")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+
+
+def test_train_with_only_core_data(run_in_default_project):
+    temp_dir = os.getcwd()
+
+    assert os.path.exists(os.path.join(temp_dir, "data/nlu.md"))
+    os.remove(os.path.join(temp_dir, "data/nlu.md"))
+    shutil.rmtree(os.path.join(temp_dir, "models"))
+
+    run_in_default_project("train", "--fixed-model-name", "test-model")
+
+    assert os.path.exists(os.path.join(temp_dir, "models"))
+    files = io_utils.list_files(os.path.join(temp_dir, "models"))
+    assert len(files) == 1
+    assert os.path.basename(files[0]) == "test-model.tar.gz"
+
+
+def test_train_core(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "train",
+        "core",
+        "-c",
+        "config.yml",
+        "-d",
+        "domain.yml",
+        "--stories",
+        "data",
+        "--out",
+        "train_rasa_models",
+        "--fixed-model-name",
+        "rasa-model",
+    )
+
+    assert os.path.exists("train_rasa_models/rasa-model.tar.gz")
+    assert os.path.isfile("train_rasa_models/rasa-model.tar.gz")
+
+
+def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]):
+
+    os.remove("domain.yml")
+    run_in_default_project(
+        "train",
+        "core",
+        "--config",
+        "config.yml",
+        "--domain",
+        "domain1.yml",
+        "--stories",
+        "data",
+        "--out",
+        "train_rasa_models_no_domain",
+        "--fixed-model-name",
+        "rasa-model",
+    )
+
+    assert not os.path.exists("train_rasa_models_no_domain/rasa-model.tar.gz")
+    assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz")
+
+
+def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
+    run_in_default_project(
+        "train",
+        "nlu",
+        "-c",
+        "config.yml",
+        "--nlu",
+        "data/nlu.md",
+        "--out",
+        "train_models",
+    )
+
+    assert os.path.exists("train_models")
+    files = io_utils.list_files("train_models")
+    assert len(files) == 1
+    assert os.path.basename(files[0]).startswith("nlu-")
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is None
+    assert not os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_nlu_persist_nlu_data(
+    run_in_default_project: Callable[..., RunResult]
+) -> None:
+    run_in_default_project(
+        "train",
+        "nlu",
+        "-c",
+        "config.yml",
+        "--nlu",
+        "data/nlu.md",
+        "--out",
+        "train_models",
+        "--persist-nlu-data",
+    )
+
+    assert os.path.exists("train_models")
+    files = io_utils.list_files("train_models")
+    assert len(files) == 1
+    assert os.path.basename(files[0]).startswith("nlu-")
+    model_dir = model.get_model("train_models")
+    assert model_dir is not None
+    metadata = Metadata.load(os.path.join(model_dir, "nlu"))
+    assert metadata.get("training_data") is not None
+    assert os.path.exists(
+        os.path.join(model_dir, "nlu", training_data.DEFAULT_TRAINING_DATA_OUTPUT_PATH)
+    )
+
+
+def test_train_help(run):
+    output = run("train", "--help")
+
+    help_text = """usage: rasa train [-h] [-v] [-vv] [--quiet] [--data DATA [DATA ...]]
+                  [-c CONFIG] [-d DOMAIN] [--out OUT]
+                  [--augmentation AUGMENTATION] [--debug-plots]
+                  [--dump-stories] [--fixed-model-name FIXED_MODEL_NAME]
+                  [--persist-nlu-data] [--force]
+                  {core,nlu} ..."""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_train_nlu_help(run: Callable[..., RunResult]):
+    output = run("train", "nlu", "--help")
+
+    help_text = """usage: rasa train nlu [-h] [-v] [-vv] [--quiet] [-c CONFIG] [--out OUT]
+                      [-u NLU] [--fixed-model-name FIXED_MODEL_NAME]
+                      [--persist-nlu-data]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_train_core_help(run: Callable[..., RunResult]):
+    output = run("train", "core", "--help")
+
+    help_text = """usage: rasa train core [-h] [-v] [-vv] [--quiet] [-s STORIES] [-d DOMAIN]
+                       [-c CONFIG [CONFIG ...]] [--out OUT]
+                       [--augmentation AUGMENTATION] [--debug-plots]
+                       [--dump-stories] [--force]
+                       [--fixed-model-name FIXED_MODEL_NAME]
+                       [--percentages [PERCENTAGES [PERCENTAGES ...]]]
+                       [--runs RUNS]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+@pytest.mark.parametrize(
+    "parameters",
+    [
+        {
+            "config_data": {"language": "en", "pipeline": "supervised"},
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
+            "error": True,
+        },
+        {
+            "config_data": {},
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": True,
+        },
+        {
+            "config_data": {
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+                "imports": "other-folder",
+            },
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": {
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_NLU,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": {
+                "language": "en",
+                "pipeline": "supervised",
+                "policies": ["KerasPolicy", "FallbackPolicy"],
+            },
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": False,
+        },
+        {
+            "config_data": None,
+            "default_config": {"language": "en", "pipeline": "supervised"},
+            "mandatory_keys": CONFIG_MANDATORY_KEYS_CORE,
+            "error": True,
+        },
+        {
+            "config_data": None,
+            "default_config": None,
+            "mandatory_keys": CONFIG_MANDATORY_KEYS,
+            "error": True,
+        },
+    ],
+)
+def test_get_valid_config(parameters):
+    import rasa.utils.io
+
+    config_path = None
+    if parameters["config_data"] is not None:
+        config_path = os.path.join(tempfile.mkdtemp(), "config.yml")
+        rasa.utils.io.write_yaml_file(parameters["config_data"], config_path)
+
+    default_config_path = None
+    if parameters["default_config"] is not None:
+        default_config_path = os.path.join(tempfile.mkdtemp(), "default-config.yml")
+        rasa.utils.io.write_yaml_file(parameters["default_config"], default_config_path)
+
+    if parameters["error"]:
+        with pytest.raises(SystemExit):
+            _get_valid_config(config_path, parameters["mandatory_keys"])
+
+    else:
+        config_path = _get_valid_config(
+            config_path, parameters["mandatory_keys"], default_config_path
+        )
+
+        config_data = rasa.utils.io.read_yaml_file(config_path)
+
+        for k in parameters["mandatory_keys"]:
+            assert k in config_data
+
+
+def test_get_valid_config_with_non_existing_file():
+    with pytest.raises(SystemExit):
+        _get_valid_config("non-existing-file.yml", CONFIG_MANDATORY_KEYS)
diff --git a/tests/cli/test_rasa_visualize.py b/tests/cli/test_rasa_visualize.py
new file mode 100644
index 000000000000..f69115c9529b
--- /dev/null
+++ b/tests/cli/test_rasa_visualize.py
@@ -0,0 +1,15 @@
+from typing import Callable
+from _pytest.pytester import RunResult
+
+
+def test_visualize_help(run: Callable[..., RunResult]):
+    output = run("visualize", "--help")
+
+    help_text = """usage: rasa visualize [-h] [-v] [-vv] [--quiet] [-d DOMAIN] [-s STORIES]
+                      [-c CONFIG] [--out OUT] [--max-history MAX_HISTORY]
+                      [-u NLU]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
diff --git a/tests/cli/test_rasa_x.py b/tests/cli/test_rasa_x.py
new file mode 100644
index 000000000000..656ec082b8a4
--- /dev/null
+++ b/tests/cli/test_rasa_x.py
@@ -0,0 +1,152 @@
+from pathlib import Path
+
+import pytest
+from typing import Callable, Dict
+from _pytest.pytester import RunResult
+
+
+from aioresponses import aioresponses
+
+import rasa.utils.io as io_utils
+from rasa.cli import x
+from rasa.utils.endpoints import EndpointConfig
+from rasa.core.utils import AvailableEndpoints
+
+
+def test_x_help(run: Callable[..., RunResult]):
+    output = run("x", "--help")
+
+    help_text = """usage: rasa x [-h] [-v] [-vv] [--quiet] [-m MODEL] [--data DATA] [-c CONFIG]
+              [--no-prompt] [--production] [--rasa-x-port RASA_X_PORT]
+              [--config-endpoint CONFIG_ENDPOINT] [--log-file LOG_FILE]
+              [--endpoints ENDPOINTS] [-p PORT] [-t AUTH_TOKEN]
+              [--cors [CORS [CORS ...]]] [--enable-api]
+              [--remote-storage REMOTE_STORAGE]
+              [--ssl-certificate SSL_CERTIFICATE] [--ssl-keyfile SSL_KEYFILE]
+              [--ssl-ca-file SSL_CA_FILE] [--ssl-password SSL_PASSWORD]
+              [--credentials CREDENTIALS] [--connector CONNECTOR]
+              [--jwt-secret JWT_SECRET] [--jwt-method JWT_METHOD]"""
+
+    lines = help_text.split("\n")
+
+    for i, line in enumerate(lines):
+        assert output.outlines[i] == line
+
+
+def test_prepare_credentials_for_rasa_x_if_rasa_channel_not_given(tmpdir: Path):
+    credentials_path = str(tmpdir / "credentials.yml")
+
+    io_utils.write_yaml_file({}, credentials_path)
+
+    tmp_credentials = x._prepare_credentials_for_rasa_x(
+        credentials_path, "http://localhost:5002"
+    )
+
+    actual = io_utils.read_config_file(tmp_credentials)
+
+    assert actual["rasa"]["url"] == "http://localhost:5002"
+
+
+def test_prepare_credentials_if_already_valid(tmpdir: Path):
+    credentials_path = str(tmpdir / "credentials.yml")
+
+    credentials = {
+        "rasa": {"url": "my-custom-url"},
+        "another-channel": {"url": "some-url"},
+    }
+    io_utils.write_yaml_file(credentials, credentials_path)
+
+    x._prepare_credentials_for_rasa_x(credentials_path)
+
+    actual = io_utils.read_config_file(credentials_path)
+
+    assert actual == credentials
+
+
+def test_if_default_endpoint_config_is_valid_in_local_mode():
+    event_broker_endpoint = x._get_event_broker_endpoint(None)
+
+    assert x._is_correct_event_broker(event_broker_endpoint)
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"type": "mongo", "url": "mongodb://localhost:27017"},
+        {"type": "sql", "dialect": "postgresql"},
+        {"type": "sql", "dialect": "sqlite", "db": "some.db"},
+    ],
+)
+def test_if_endpoint_config_is_invalid_in_local_mode(kwargs: Dict):
+    config = EndpointConfig(**kwargs)
+    assert not x._is_correct_event_broker(config)
+
+
+def test_overwrite_model_server_url():
+    endpoint_config = EndpointConfig(url="http://testserver:5002/models/default@latest")
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+def test_overwrite_model_server_url_with_no_model_endpoint():
+    endpoints = AvailableEndpoints()
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+def test_reuse_wait_time_between_pulls():
+    test_wait_time = 5
+    endpoint_config = EndpointConfig(
+        url="http://localhost:5002/models/default@latest",
+        wait_time_between_pulls=test_wait_time,
+    )
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    assert endpoints.model.kwargs["wait_time_between_pulls"] == test_wait_time
+
+
+def test_default_wait_time_between_pulls():
+    endpoint_config = EndpointConfig(url="http://localhost:5002/models/default@latest")
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert endpoints.model.kwargs["wait_time_between_pulls"] == 2
+
+
+def test_default_model_server_url():
+    endpoint_config = EndpointConfig()
+    endpoints = AvailableEndpoints(model=endpoint_config)
+    x._overwrite_endpoints_for_local_x(endpoints, "test", "http://localhost")
+    assert (
+        endpoints.model.url
+        == "http://localhost/projects/default/models/tags/production"
+    )
+
+
+async def test_pull_runtime_config_from_server():
+    config_url = "http://example.com/api/config?token=token"
+    credentials = "rasa: http://example.com:5002/api"
+    endpoint_config = """
+    event_broker:
+        url: http://example.com/event_broker
+        username: some_username
+        password: PASSWORD
+        queue: broker_queue
+    """
+    with aioresponses() as mocked:
+        mocked.get(
+            config_url,
+            payload={"credentials": credentials, "endpoints": endpoint_config},
+        )
+
+        endpoints_path, credentials_path = await x._pull_runtime_config_from_server(
+            config_url, 1, 0
+        )
+
+        assert io_utils.read_file(endpoints_path) == endpoint_config
+        assert io_utils.read_file(credentials_path) == credentials
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
new file mode 100644
index 000000000000..4d270be3964b
--- /dev/null
+++ b/tests/cli/test_utils.py
@@ -0,0 +1,105 @@
+import contextlib
+import logging
+import os
+import pathlib
+import sys
+import tempfile
+
+import pytest
+from _pytest.logging import LogCaptureFixture
+
+import rasa.cli.utils
+from rasa.cli.utils import (
+    parse_last_positional_argument_as_model_path,
+    get_validated_path,
+)
+
+
+@contextlib.contextmanager
+def make_actions_subdir():
+    """Create a subdir called actions to test model argument handling."""
+    with tempfile.TemporaryDirectory() as tempdir:
+        cwd = os.getcwd()
+        os.chdir(tempdir)
+        try:
+            (pathlib.Path(tempdir) / "actions").mkdir()
+            yield
+        finally:
+            os.chdir(cwd)
+
+
+@pytest.mark.parametrize(
+    "argv",
+    [
+        ["rasa", "run"],
+        ["rasa", "run", "actions"],
+        ["rasa", "run", "core"],
+        ["rasa", "interactive", "nlu", "--param", "xy"],
+    ],
+)
+def test_parse_last_positional_argument_as_model_path(argv):
+    with make_actions_subdir():
+        test_model_dir = tempfile.gettempdir()
+        argv.append(test_model_dir)
+
+        sys.argv = argv.copy()
+        parse_last_positional_argument_as_model_path()
+
+        assert sys.argv[-2] == "--model"
+        assert sys.argv[-1] == test_model_dir
+
+
+@pytest.mark.parametrize(
+    "argv",
+    [
+        ["rasa", "run"],
+        ["rasa", "run", "actions"],
+        ["rasa", "run", "core"],
+        ["rasa", "test", "nlu", "--param", "xy", "--model", "test"],
+    ],
+)
+def test_parse_no_positional_model_path_argument(argv):
+    with make_actions_subdir():
+        sys.argv = argv.copy()
+
+        parse_last_positional_argument_as_model_path()
+
+        assert sys.argv == argv
+
+
+def test_validate_invalid_path():
+    with pytest.raises(SystemExit):
+        get_validated_path("test test test", "out", "default")
+
+
+def test_validate_valid_path():
+    tempdir = tempfile.mkdtemp()
+
+    assert get_validated_path(tempdir, "out", "default") == tempdir
+
+
+def test_validate_if_none_is_valid():
+    assert get_validated_path(None, "out", "default", True) is None
+
+
+def test_validate_with_none_if_default_is_valid(caplog: LogCaptureFixture):
+    tempdir = tempfile.mkdtemp()
+
+    with caplog.at_level(logging.WARNING, rasa.cli.utils.logger.name):
+        assert get_validated_path(None, "out", tempdir) == tempdir
+
+    assert caplog.records == []
+
+
+def test_validate_with_invalid_directory_if_default_is_valid(caplog: LogCaptureFixture):
+    tempdir = tempfile.mkdtemp()
+    invalid_directory = "gcfhvjkb"
+    with pytest.warns(UserWarning) as record:
+        assert get_validated_path(invalid_directory, "out", tempdir) == tempdir
+    assert len(record) == 1
+    assert "does not seem to exist" in record[0].message.args[0]
+
+
+def test_print_error_and_exit():
+    with pytest.raises(SystemExit):
+        rasa.cli.utils.print_error_and_exit("")

From 4108753f547db256afb70f27a00c8c3759831781 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 11 Feb 2020 22:21:33 +0100
Subject: [PATCH 356/633] black

---
 rasa/utils/tensorflow/layers.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 6b776cf04563..6ae325e7bee9 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -155,11 +155,9 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
 
 
 class InputMask(tf.keras.layers.Layer):
-
     def build(self, input_shape: tf.TensorShape) -> None:
         self.mask_vector = self.add_weight(
-            shape=(1, 1, input_shape[-1]),
-            name="mask_vector",
+            shape=(1, 1, input_shape[-1]), name="mask_vector"
         )
         self.built = True
 

From d2dee3dcb4ddd2fb5c6c357e2694005c0fa1c1b8 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 12 Feb 2020 00:50:45 +0100
Subject: [PATCH 357/633] fix tag_ids test

---
 rasa/nlu/classifiers/diet_classifier.py              |  2 +-
 rasa/nlu/selectors/response_selector.py              |  2 +-
 .../{test_tf_model_data.py => test_model_data.py}    | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)
 rename tests/utils/{test_tf_model_data.py => test_model_data.py} (95%)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 91086940aa7f..464308b3a09e 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -262,7 +262,7 @@ def __init__(
         self.data_example = None
 
     @property
-    def label_key(self):
+    def label_key(self) -> Text:
         return "label_ids" if self.component_config[INTENT_CLASSIFICATION] else None
 
     @staticmethod
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index e6672586a1e2..3732d0ff69b0 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -176,7 +176,7 @@ class ResponseSelector(DIETClassifier):
     # end default properties (DOC MARKER - don't remove)
 
     @property
-    def label_key(self):
+    def label_key(self) -> Text:
         return "label_ids"
 
     @staticmethod
diff --git a/tests/utils/test_tf_model_data.py b/tests/utils/test_model_data.py
similarity index 95%
rename from tests/utils/test_tf_model_data.py
rename to tests/utils/test_model_data.py
index 6d30deaf6f74..2d5a2d5b7d9f 100644
--- a/tests/utils/test_tf_model_data.py
+++ b/tests/utils/test_model_data.py
@@ -47,11 +47,11 @@ async def model_data() -> RasaModelData:
             "tag_ids": [
                 np.array(
                     [
-                        np.array([0, 1, 1, 0, 2]),
-                        np.array([2, 0]),
-                        np.array([0, 1, 1]),
-                        np.array([0, 1]),
-                        np.array([0, 0, 0]),
+                        np.array([[0], [1], [1], [0], [2]]),
+                        np.array([[2], [0]]),
+                        np.array([[0], [1], [1]]),
+                        np.array([[0], [1]]),
+                        np.array([[0], [0], [0]]),
                     ]
                 )
             ],
@@ -152,7 +152,7 @@ def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):
 
 def test_gen_batch(model_data: RasaModelData):
     iterator = model_data._gen_batch(2, shuffle=True, batch_strategy="balanced")
-
+    print(model_data.data["tag_ids"][0])
     batch = next(iterator)
     assert len(batch) == 7
     assert len(batch[0]) == 2

From da3c49e1bdd7d806f8829cbb851aec7309606fc6 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 12 Feb 2020 00:54:19 +0100
Subject: [PATCH 358/633] add blank lines after doc strings

---
 rasa/utils/tensorflow/layers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 4ea7715d5d10..cc69955e1873 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -33,7 +33,7 @@ def dropped_inputs() -> tf.Tensor:
 
 
 class DenseForSparse(tf.keras.layers.Dense):
-    """Dense layer for sparse input tensor"""
+    """Dense layer for sparse input tensor."""
 
     def __init__(self, reg_lambda: float = 0, **kwargs) -> None:
         if reg_lambda > 0:
@@ -171,6 +171,7 @@ def call(
         training: Optional[Union[tf.Tensor, bool]] = None,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Randomly mask input sequences."""
+
         if training is None:
             training = K.learning_phase()
 
@@ -279,6 +280,7 @@ def _make_flat(x: tf.Tensor) -> tf.Tensor:
     def _random_indices(self, batch_size: tf.Tensor, total_candidates: tf.Tensor):
         def rand_idxs():
             """Create random tensor of indices"""
+
             # (1, num_neg)
             return tf.expand_dims(
                 tf.random.shuffle(tf.range(total_candidates))[: self.num_neg], 0
@@ -450,7 +452,7 @@ def _train_sim(
 
     @staticmethod
     def _calc_accuracy(sim_pos: tf.Tensor, sim_neg: tf.Tensor) -> tf.Tensor:
-        """Calculate accuracy"""
+        """Calculate accuracy."""
 
         max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
         return tf.reduce_mean(

From 7276b084f9b508cb26d2c336962f1dcbadbed95c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 08:43:18 +0100
Subject: [PATCH 359/633] refactor cli tests

---
 tests/cli/conftest.py        | 52 +++++++++++++++++++++++++++++++++++-
 tests/cli/test_rasa_data.py  | 12 ++++++---
 tests/cli/test_rasa_run.py   |  7 ++---
 tests/cli/test_rasa_test.py  | 13 +++++++--
 tests/cli/test_rasa_train.py | 48 ++++++++++++++++++---------------
 5 files changed, 100 insertions(+), 32 deletions(-)

diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
index b7294eaeadff..ee8d741639c9 100644
--- a/tests/cli/conftest.py
+++ b/tests/cli/conftest.py
@@ -1,8 +1,11 @@
 from typing import Callable
 import pytest
+import shutil
 import os
 from _pytest.pytester import Testdir, RunResult
 
+from rasa.utils.io import write_yaml_file
+
 
 @pytest.fixture
 def run(testdir: Testdir) -> Callable[..., RunResult]:
@@ -22,13 +25,60 @@ def do_run(*args, stdin):
     return do_run
 
 
+@pytest.fixture
+def run_in_default_project_without_models(testdir: Testdir) -> Callable[..., RunResult]:
+    os.environ["LOG_LEVEL"] = "ERROR"
+
+    _set_up_initial_project(testdir)
+
+    def do_run(*args):
+        args = ["rasa"] + list(args)
+        return testdir.run(*args)
+
+    return do_run
+
+
 @pytest.fixture
 def run_in_default_project(testdir: Testdir) -> Callable[..., RunResult]:
     os.environ["LOG_LEVEL"] = "ERROR"
-    testdir.run("rasa", "init", "--no-prompt")
+
+    _set_up_initial_project(testdir)
+
+    testdir.run("rasa", "train")
 
     def do_run(*args):
         args = ["rasa"] + list(args)
         return testdir.run(*args)
 
     return do_run
+
+
+def _set_up_initial_project(testdir: Testdir):
+    # copy initial project files
+    testdir.copy_example("rasa/cli/initial_project/actions.py")
+    testdir.copy_example("rasa/cli/initial_project/credentials.yml")
+    testdir.copy_example("rasa/cli/initial_project/domain.yml")
+    testdir.copy_example("rasa/cli/initial_project/endpoints.yml")
+    testdir.mkdir("data")
+    testdir.copy_example("rasa/cli/initial_project/data")
+    testdir.run("mv", "nlu.md", "data/nlu.md")
+    testdir.run("mv", "stories.md", "data/stories.md")
+
+    # create a config file
+    # for the cli test the resulting model is not important, use components that are
+    # fast to train
+    write_yaml_file(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "KeywordIntentClassifier"},
+            ],
+            "policies": [
+                {"name": "MappingPolicy"},
+                {"name": "MemoizationPolicy", "max_history": 5},
+            ],
+        },
+        "config.yml",
+    )
diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py
index 3021e9ab12e7..c5cee11bc560 100644
--- a/tests/cli/test_rasa_data.py
+++ b/tests/cli/test_rasa_data.py
@@ -6,8 +6,10 @@
 from rasa.cli import data
 
 
-def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
+def test_data_split_nlu(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
+    run_in_default_project_without_models(
         "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75"
     )
 
@@ -16,8 +18,10 @@ def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]):
     assert os.path.exists(os.path.join("train_test_split", "training_data.md"))
 
 
-def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
+def test_data_convert_nlu(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
+    run_in_default_project_without_models(
         "data",
         "convert",
         "nlu",
diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py
index 8de8685e91dd..7a054b26619a 100644
--- a/tests/cli/test_rasa_run.py
+++ b/tests/cli/test_rasa_run.py
@@ -4,12 +4,13 @@
 from _pytest.pytester import RunResult
 
 
-def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]):
+def test_run_does_not_start(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
     os.remove("domain.yml")
-    shutil.rmtree("models")
 
     # the server should not start as no model is configured
-    output = run_in_default_project("run")
+    output = run_in_default_project_without_models("run")
 
     assert "No model found." in output.outlines[0]
 
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index 50d21c6e6978..ac9a6602edb0 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -59,10 +59,18 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
 
 
 def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
-    copyfile("config.yml", "nlu-config.yml")
+    copyfile("config.yml", "config-1.yml")
 
     run_in_default_project(
-        "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2"
+        "test",
+        "nlu",
+        "-config",
+        "config.yml",
+        "config-1.yml",
+        "--run",
+        "2",
+        "-percentages",
+        "75",
     )
 
     assert os.path.exists("results/run_1")
@@ -106,6 +114,7 @@ def test_test_core_comparison_after_train(
         },
         "config_2.yml",
     )
+
     run_in_default_project(
         "train",
         "core",
diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py
index 1839c77b3ab9..68ebcff0b1e6 100644
--- a/tests/cli/test_rasa_train.py
+++ b/tests/cli/test_rasa_train.py
@@ -18,10 +18,10 @@
 import rasa.utils.io as io_utils
 
 
-def test_train(run_in_default_project: Callable[..., RunResult]):
+def test_train(run_in_default_project_without_models: Callable[..., RunResult]):
     temp_dir = os.getcwd()
 
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "-c",
         "config.yml",
@@ -48,10 +48,12 @@ def test_train(run_in_default_project: Callable[..., RunResult]):
     )
 
 
-def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]):
+def test_train_persist_nlu_data(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
     temp_dir = os.getcwd()
 
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "-c",
         "config.yml",
@@ -79,7 +81,9 @@ def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]
     )
 
 
-def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
+def test_train_core_compare(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
     temp_dir = os.getcwd()
 
     io_utils.write_yaml_file(
@@ -100,7 +104,7 @@ def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
         "config_2.yml",
     )
 
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "core",
         "-c",
@@ -132,11 +136,11 @@ def test_train_core_compare(run_in_default_project: Callable[..., RunResult]):
 
 
 def test_train_no_domain_exists(
-    run_in_default_project: Callable[..., RunResult]
+    run_in_default_project_without_models: Callable[..., RunResult]
 ) -> None:
 
     os.remove("domain.yml")
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "-c",
         "config.yml",
@@ -191,14 +195,13 @@ def test_train_force(run_in_default_project):
     assert len(files) == 2
 
 
-def test_train_with_only_nlu_data(run_in_default_project):
+def test_train_with_only_nlu_data(run_in_default_project_without_models):
     temp_dir = os.getcwd()
 
     assert os.path.exists(os.path.join(temp_dir, "data/stories.md"))
     os.remove(os.path.join(temp_dir, "data/stories.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
 
-    run_in_default_project("train", "--fixed-model-name", "test-model")
+    run_in_default_project_without_models("train", "--fixed-model-name", "test-model")
 
     assert os.path.exists(os.path.join(temp_dir, "models"))
     files = io_utils.list_files(os.path.join(temp_dir, "models"))
@@ -206,14 +209,13 @@ def test_train_with_only_nlu_data(run_in_default_project):
     assert os.path.basename(files[0]) == "test-model.tar.gz"
 
 
-def test_train_with_only_core_data(run_in_default_project):
+def test_train_with_only_core_data(run_in_default_project_without_models):
     temp_dir = os.getcwd()
 
     assert os.path.exists(os.path.join(temp_dir, "data/nlu.md"))
     os.remove(os.path.join(temp_dir, "data/nlu.md"))
-    shutil.rmtree(os.path.join(temp_dir, "models"))
 
-    run_in_default_project("train", "--fixed-model-name", "test-model")
+    run_in_default_project_without_models("train", "--fixed-model-name", "test-model")
 
     assert os.path.exists(os.path.join(temp_dir, "models"))
     files = io_utils.list_files(os.path.join(temp_dir, "models"))
@@ -221,8 +223,8 @@ def test_train_with_only_core_data(run_in_default_project):
     assert os.path.basename(files[0]) == "test-model.tar.gz"
 
 
-def test_train_core(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
+def test_train_core(run_in_default_project_without_models: Callable[..., RunResult]):
+    run_in_default_project_without_models(
         "train",
         "core",
         "-c",
@@ -241,10 +243,12 @@ def test_train_core(run_in_default_project: Callable[..., RunResult]):
     assert os.path.isfile("train_rasa_models/rasa-model.tar.gz")
 
 
-def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]):
+def test_train_core_no_domain_exists(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
 
     os.remove("domain.yml")
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "core",
         "--config",
@@ -263,8 +267,8 @@ def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunRe
     assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz")
 
 
-def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
-    run_in_default_project(
+def test_train_nlu(run_in_default_project_without_models: Callable[..., RunResult]):
+    run_in_default_project_without_models(
         "train",
         "nlu",
         "-c",
@@ -289,9 +293,9 @@ def test_train_nlu(run_in_default_project: Callable[..., RunResult]):
 
 
 def test_train_nlu_persist_nlu_data(
-    run_in_default_project: Callable[..., RunResult]
+    run_in_default_project_without_models: Callable[..., RunResult]
 ) -> None:
-    run_in_default_project(
+    run_in_default_project_without_models(
         "train",
         "nlu",
         "-c",

From 4898894e45d890765a98ea9fdef161d4320308cb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 08:44:16 +0100
Subject: [PATCH 360/633] fix docs

---
 docs/nlu/components.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c5f6c2a8a060..d1a9e41a6ebe 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -685,7 +685,7 @@ Response Selector
     In the config, you can specify these parameters.
     The default values are defined in ``ResponseSelector.defaults``:
 
-    .. literalinclude:: ../../rasa/nlu/selectors/embedding_response_selector.py
+    .. literalinclude:: ../../rasa/nlu/selectors/response_selector.py
        :dedent: 4
        :start-after: # default properties (DOC MARKER - don't remove)
        :end-before: # end default properties (DOC MARKER - don't remove)

From fc561d0d786db5840d8d8e406cb0e5bf970a7776 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 08:47:22 +0100
Subject: [PATCH 361/633] fix type

---
 rasa/nlu/classifiers/diet_classifier.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 464308b3a09e..d5197e4df3a0 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -262,7 +262,7 @@ def __init__(
         self.data_example = None
 
     @property
-    def label_key(self) -> Text:
+    def label_key(self) -> Optional[Text]:
         return "label_ids" if self.component_config[INTENT_CLASSIFICATION] else None
 
     @staticmethod
@@ -1154,7 +1154,7 @@ def _combine_sparse_dense_features(
         return tf.concat(dense_features, axis=-1) * mask
 
     def _features_as_seq_ids(
-        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text,
+        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
     ) -> tf.Tensor:
         # if there are dense features it's enough
         for f in features:
@@ -1254,11 +1254,7 @@ def _mask_loss(
         a_masked_embed = self._tf_layers[f"embed.{name}_golden_token"](a_masked)
 
         return self._tf_layers[f"loss.{name}_mask"](
-            a_t_masked_embed,
-            a_masked_embed,
-            a_masked_ids,
-            a_masked_embed,
-            a_masked_ids,
+            a_t_masked_embed, a_masked_embed, a_masked_ids, a_masked_embed, a_masked_ids
         )
 
     def _label_loss(

From 3eac5e2764f4168890a444f9a6b5834635682790 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 09:03:07 +0100
Subject: [PATCH 362/633] reduce number of epochs in nlu tests

---
 tests/nlu/training/test_train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 0b2d416927db..bda988331b98 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -6,12 +6,13 @@
 from rasa.nlu.model import Interpreter, Trainer
 from rasa.nlu.train import create_persistor
 from rasa.nlu.training_data import TrainingData
+from rasa.utils.tensorflow.constants import EPOCHS
 from tests.nlu import utilities
 from tests.nlu.conftest import DEFAULT_DATA_PATH
 
 
 def as_pipeline(*components):
-    return [{"name": c} for c in components]
+    return [{"name": c, EPOCHS: 3} for c in components]
 
 
 def pipelines_for_tests():

From da4f26f57fc46c3bffbb907a534890fa448b2dc7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 09:38:57 +0100
Subject: [PATCH 363/633] Fix attribute error of pytype

---
 rasa/nlu/classifiers/diet_classifier.py       |  6 ++-
 tests/nlu/classifiers/test_diet_classifier.py | 39 +++++++++++--------
 .../test_keyword_classifier.py}               |  1 -
 tests/nlu/conftest.py                         |  4 +-
 4 files changed, 31 insertions(+), 19 deletions(-)
 rename tests/nlu/{base/test_classifiers.py => classifiers/test_keyword_classifier.py} (99%)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index d5197e4df3a0..41bcfbd578b1 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1281,7 +1281,9 @@ def _entity_loss(
 
         # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
-        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
+        loss = self._tf_layers["crf"].loss(
+            logits, c, sequence_lengths
+        )  # pytype: disable=attribute-error
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
@@ -1378,12 +1380,14 @@ def batch_predict(
             cls = self._last_token(text_transformed, sequence_lengths)
             cls_embed = self._tf_layers["embed.text"](cls)
 
+            # pytype: disable=attribute-error
             sim_all = self._tf_layers["loss.label"].sim(
                 cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
             )
             scores = self._tf_layers["loss.label"].confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
             )
+            # pytype: enable=attribute-error
             out["i_scores"] = scores
 
         if self.config[ENTITY_RECOGNITION]:
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index c555a28f1215..0f556551d4b5 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -6,18 +6,17 @@
 from rasa.nlu import train
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.constants import (
-    TEXT,
-    SPARSE_FEATURE_NAMES,
-    DENSE_FEATURE_NAMES,
-    INTENT,
+from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, INTENT
+from rasa.utils.tensorflow.constants import (
+    LOSS_TYPE,
+    RANDOM_SEED,
+    RANKING_LENGTH,
+    EPOCHS,
 )
-from rasa.utils.tensorflow.constants import LOSS_TYPE, RANDOM_SEED, RANKING_LENGTH
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.model import Interpreter
 from rasa.nlu.training_data import Message
 from rasa.utils import train_utils
-from tests.nlu import utilities
 from tests.nlu.conftest import DEFAULT_DATA_PATH
 
 
@@ -88,12 +87,12 @@ def test_check_labels_features_exist(messages, expected):
             {"name": "ConveRTTokenizer"},
             {"name": "CountVectorsFeaturizer"},
             {"name": "ConveRTFeaturizer"},
-            {"name": "DIETClassifier"},
+            {"name": "DIETClassifier", EPOCHS: 3},
         ],
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETClassifier", LOSS_TYPE: "margin"},
+            {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 3},
         ],
     ],
 )
@@ -125,7 +124,10 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
 
     _config = RasaNLUModelConfig(
         {
-            "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "DIETClassifier"}],
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "DIETClassifier", EPOCHS: 3},
+            ],
             "language": "en",
         }
     )
@@ -151,27 +153,32 @@ def as_pipeline(*components):
 @pytest.mark.parametrize(
     "classifier_params, data_path, output_length, output_should_sum_to_1",
     [
-        ({"random_seed": 42}, "data/test/many_intents.md", 10, True),  # default config
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 0},
+            {RANDOM_SEED: 42, EPOCHS: 3},
+            "data/test/many_intents.md",
+            10,
+            True,
+        ),  # default config
+        (
+            {RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 3},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # no normalization
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 3},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 3, EPOCHS: 3},
             "data/test/many_intents.md",
             3,
             True,
         ),  # lower than default ranking_length
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 12},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 12, EPOCHS: 3},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # higher than default ranking_length
         (
-            {RANDOM_SEED: 42},
+            {RANDOM_SEED: 42, EPOCHS: 3},
             "examples/moodbot/data/nlu.md",
             7,
             True,
@@ -218,7 +225,7 @@ async def test_softmax_normalization(
 
 @pytest.mark.parametrize(
     "classifier_params, output_length",
-    [({LOSS_TYPE: "margin", RANDOM_SEED: 42}, LABEL_RANKING_LENGTH)],
+    [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 3}, LABEL_RANKING_LENGTH)],
 )
 async def test_margin_loss_is_not_normalized(
     monkeypatch, component_builder, tmpdir, classifier_params, output_length
diff --git a/tests/nlu/base/test_classifiers.py b/tests/nlu/classifiers/test_keyword_classifier.py
similarity index 99%
rename from tests/nlu/base/test_classifiers.py
rename to tests/nlu/classifiers/test_keyword_classifier.py
index 45fcc27fd152..fc7b203a9e70 100644
--- a/tests/nlu/base/test_classifiers.py
+++ b/tests/nlu/classifiers/test_keyword_classifier.py
@@ -1,6 +1,5 @@
 import pytest
 import copy
-import logging
 
 from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier
 
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 8327c4572774..43964137feb8 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -5,6 +5,7 @@
 
 from rasa.nlu import config, train
 from rasa.nlu.components import ComponentBuilder
+from rasa.utils.tensorflow.constants import EPOCHS
 
 CONFIG_DEFAULTS_PATH = "sample_configs/config_defaults.yml"
 
@@ -41,7 +42,8 @@ def ner_crf_pos_feature_config():
             ["low", "title", "upper", "pos", "pos2"],
             ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
             ["low", "title", "upper", "pos", "pos2"],
-        ]
+        ],
+        EPOCHS: 3,
     }
 
 

From 4477ba832f88ae3d0335d6a02a9c6bd399c6bce8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 10:12:36 +0100
Subject: [PATCH 364/633] fix tests and types

---
 rasa/nlu/classifiers/diet_classifier.py       |  6 ++--
 .../embedding_intent_classifier.py            |  8 ++---
 rasa/nlu/extractors/crf_entity_extractor.py   |  8 ++---
 rasa/nlu/selectors/response_selector.py       | 32 ++++++++++++++++---
 tests/cli/test_rasa_test.py                   | 11 ++++---
 5 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 41bcfbd578b1..c1ec718ee594 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1281,9 +1281,9 @@ def _entity_loss(
 
         # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
-        loss = self._tf_layers["crf"].loss(
-            logits, c, sequence_lengths
-        )  # pytype: disable=attribute-error
+        # pytype: disable=attribute-error
+        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
+        # pytype: enable=attribute-error
 
         # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index da2584d7cd73..1828a65f5cc8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -4,11 +4,7 @@
 from rasa.constants import DOCS_BASE_URL
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
-from rasa.nlu.constants import (
-    TEXT,
-    DENSE_FEATURE_NAMES,
-    SPARSE_FEATURE_NAMES,
-)
+from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -125,7 +121,7 @@ def __init__(
 
         component_config = component_config or {}
 
-        # the following properties are fixed for the EmbeddingIntentClassifier
+        # the following properties cannot be adapted for the EmbeddingIntentClassifier
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[MASKED_LM] = False
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index fdfe5ced5602..49c4df99d067 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -11,11 +11,7 @@
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.constants import DOCS_BASE_URL
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
-from rasa.nlu.constants import (
-    TEXT,
-    ENTITIES,
-    TOKENS_NAMES,
-)
+from rasa.nlu.constants import TEXT, ENTITIES, TOKENS_NAMES
 from rasa.utils.tensorflow.constants import (
     HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
@@ -122,7 +118,7 @@ def __init__(
     ) -> None:
         component_config = component_config or {}
 
-        # the following properties are fixed for the CRFEntityExtractor
+        # the following properties cannot be adapted for the CRFEntityExtractor
         component_config[INTENT_CLASSIFICATION] = False
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 3732d0ff69b0..25123eb3110b 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -85,7 +85,7 @@ class ResponseSelector(DIETClassifier):
 
     requires = [
         any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
-        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE],),
+        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
     # default properties (DOC MARKER - don't remove)
@@ -175,6 +175,30 @@ class ResponseSelector(DIETClassifier):
     }
     # end default properties (DOC MARKER - don't remove)
 
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        model: Optional[RasaModel] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+    ) -> None:
+
+        component_config = component_config or {}
+
+        # the following properties cannot be adapted for the ResponseSelector
+        component_config[INTENT_CLASSIFICATION] = True
+        component_config[ENTITY_RECOGNITION] = False
+        component_config[BILOU_FLAG] = False
+
+        super().__init__(
+            component_config,
+            inverted_label_dict,
+            inverted_tag_dict,
+            model,
+            batch_tuple_sizes,
+        )
+
     @property
     def label_key(self) -> Text:
         return "label_ids"
@@ -224,7 +248,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         )
 
         model_data = self._create_model_data(
-            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE,
+            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
         )
 
         self.check_input_dimension_consistency(model_data)
@@ -306,7 +330,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         sequence_lengths_label = self._get_sequence_lengths(mask_label)
 
         label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data["label_features"], mask_label, self.label_name,
+            self.tf_label_data["label_features"], mask_label, self.label_name
         )
         cls_label = self._last_token(label_transformed, sequence_lengths_label)
 
@@ -339,7 +363,7 @@ def batch_loss(
         sequence_lengths_label = self._get_sequence_lengths(mask_label)
 
         label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["label_features"], mask_label, self.label_name,
+            tf_batch_data["label_features"], mask_label, self.label_name
         )
 
         losses = []
diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py
index ac9a6602edb0..fa21e22c76f3 100644
--- a/tests/cli/test_rasa_test.py
+++ b/tests/cli/test_rasa_test.py
@@ -58,19 +58,22 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu
     assert os.path.exists("results/confmat.png")
 
 
-def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]):
+def test_test_nlu_comparison(
+    run_in_default_project_without_models: Callable[..., RunResult]
+):
     copyfile("config.yml", "config-1.yml")
 
-    run_in_default_project(
+    run_in_default_project_without_models(
         "test",
         "nlu",
-        "-config",
+        "--config",
         "config.yml",
         "config-1.yml",
         "--run",
         "2",
-        "-percentages",
+        "--percentages",
         "75",
+        "25",
     )
 
     assert os.path.exists("results/run_1")

From dd5bd2fea406dcde230c888b3dcaf4348bc82b1d Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 12 Feb 2020 11:24:50 +0100
Subject: [PATCH 365/633] remove entity f1 todo

---
 rasa/nlu/classifiers/diet_classifier.py | 27 ++++++++++++-------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c1ec718ee594..88762df951ee 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1270,32 +1270,31 @@ def _label_loss(
         )
 
     def _entity_loss(
-        self, a: tf.Tensor, c: tf.Tensor, mask: tf.Tensor, sequence_lengths
+        self, a: tf.Tensor, tag_ids: tf.Tensor, mask: tf.Tensor, sequence_lengths
     ) -> Tuple[tf.Tensor, tf.Tensor]:
 
-        # remove cls token
-        sequence_lengths = sequence_lengths - 1
-        c = tf.cast(c[:, :, 0], tf.int32)
-
+        sequence_lengths = sequence_lengths - 1  # remove cls token
+        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
         logits = self._tf_layers["embed.logits"](a)
 
         # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
         # pytype: disable=attribute-error
-        loss = self._tf_layers["crf"].loss(logits, c, sequence_lengths)
+        loss = self._tf_layers["crf"].loss(logits, tag_ids, sequence_lengths)
         # pytype: enable=attribute-error
 
-        # TODO check that f1 calculation is correct
         # calculate f1 score for train predictions
         mask_bool = tf.cast(mask[:, :, 0], tf.bool)
         # pick only non padding values and flatten sequences
-        c_masked = tf.boolean_mask(c, mask_bool)
-        pred_ids_masked = tf.boolean_mask(pred_ids, mask_bool)
+        tag_ids_flat = tf.boolean_mask(tag_ids, mask_bool)
+        pred_ids_flat = tf.boolean_mask(pred_ids, mask_bool)
         # set `0` prediction to not a prediction
-        c_masked_1 = tf.one_hot(c_masked - 1, self._num_tags - 1)
-        pred_ids_masked_1 = tf.one_hot(pred_ids_masked - 1, self._num_tags - 1)
+        tag_ids_flat_one_hot = tf.one_hot(tag_ids_flat - 1, self._num_tags - 1)
+        pred_ids_flat_one_hot = tf.one_hot(pred_ids_flat - 1, self._num_tags - 1)
 
-        f1 = self._tf_layers["crf_f1_score"](c_masked_1, pred_ids_masked_1)
+        f1 = self._tf_layers["crf_f1_score"](
+            tag_ids_flat_one_hot, pred_ids_flat_one_hot
+        )
 
         return loss, f1
 
@@ -1346,10 +1345,10 @@ def batch_loss(
             losses.append(loss)
 
         if self.config[ENTITY_RECOGNITION]:
-            tags = tf_batch_data["tag_ids"][0]
+            tag_ids = tf_batch_data["tag_ids"][0]
 
             loss, f1 = self._entity_loss(
-                text_transformed, tags, mask_text, sequence_lengths
+                text_transformed, tag_ids, mask_text, sequence_lengths
             )
             self.entity_loss.update_state(loss)
             self.entity_f1.update_state(f1)

From 270b0b240c123723ac7e86fab9f6d21215b02431 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 11:41:13 +0100
Subject: [PATCH 366/633] fix crf entity extractor test

---
 tests/nlu/conftest.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 43964137feb8..8eed2885117c 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -42,8 +42,7 @@ def ner_crf_pos_feature_config():
             ["low", "title", "upper", "pos", "pos2"],
             ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
             ["low", "title", "upper", "pos", "pos2"],
-        ],
-        EPOCHS: 3,
+        ]
     }
 
 

From b88ed3e873b17ae03116086da50346f2d1bff920 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 13:40:19 +0100
Subject: [PATCH 367/633] refactored variable names

---
 tests/nlu/featurizers/test_lm_featurizer.py | 10 +++++-----
 tests/nlu/tokenizers/test_lm_tokenizer.py   | 10 ++++------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
index ac2ec662d72c..119da8622d9c 100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -5,11 +5,11 @@
 from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.constants import (
-    TEXT_ATTRIBUTE,
+    TEXT,
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
-    RESPONSE_ATTRIBUTE,
-    INTENT_ATTRIBUTE,
+    RESPONSE,
+    INTENT,
     LANGUAGE_MODEL_DOCS,
 )
 from rasa.nlu.training_data import Message
@@ -206,7 +206,7 @@ def test_lm_featurizer_shape_values(
 
     for index in range(len(texts)):
 
-        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
+        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
         computed_sequence_vec, computed_sentence_vec = (
             computed_feature_vec[:-1],
             computed_feature_vec[-1],
@@ -226,6 +226,6 @@ def test_lm_featurizer_shape_values(
             computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
         )
 
-        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
+        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])
 
         assert intent_vec is None
diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py
index 50fb8cd60370..435c3341a2aa 100644
--- a/tests/nlu/tokenizers/test_lm_tokenizer.py
+++ b/tests/nlu/tokenizers/test_lm_tokenizer.py
@@ -1,7 +1,7 @@
 import pytest
 
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES
+from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES
 from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 
@@ -306,7 +306,7 @@ def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_in
 
         message = Message.build(text=text)
         transformers_nlp.process(message)
-        tokens = lm_tokenizer.tokenize(message, TEXT_ATTRIBUTE)
+        tokens = lm_tokenizer.tokenize(message, TEXT)
 
         assert [t.text for t in tokens] == gt_tokens
         assert [t.start for t in tokens] == [i[0] for i in gt_indices]
@@ -330,13 +330,11 @@ def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
     lm_tokenizer = LanguageModelTokenizer(component_config)
 
     message = Message(text)
-    message.set(INTENT_ATTRIBUTE, text)
+    message.set(INTENT, text)
 
     td = TrainingData([message])
 
     transformers_nlp.train(td)
     lm_tokenizer.train(td)
 
-    assert [
-        t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE])
-    ] == expected_tokens
+    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens

From 4bfb4ed180b7a0f8078e9f66b55ea46f42aa230e Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 14:58:40 +0100
Subject: [PATCH 368/633] removed unnecessary component from a test pipeline

---
 tests/nlu/training/test_train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 64fd01f928c3..7981da09c6b8 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -80,7 +80,6 @@ def pipelines_for_tests():
             as_pipeline(
                 "ConveRTTokenizer",
                 "ConveRTFeaturizer",
-                "LanguageModelFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "CRFEntityExtractor",

From 62c9b5f28b5077292cc001bcd1a1d7e02077b58f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 12 Feb 2020 15:13:00 +0100
Subject: [PATCH 369/633] fix output of validation results

---
 rasa/utils/tensorflow/models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index f68e03fc27ac..77e67fddb739 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -81,6 +81,7 @@ def fit(
             eager, evaluate_on_num_examples, evaluation_model_data
         )
 
+        val_results = {}  # validation is not performed every epoch
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
         for ep in pbar:
@@ -105,9 +106,9 @@ def fit(
                         ep_batch_size,
                         False,
                     )
+                    val_results = self._get_metric_results(prefix="val_")
 
-                # Get the metric results
-                postfix_dict.update(self._get_metric_results(prefix="val_"))
+                postfix_dict.update(val_results)
 
             pbar.set_postfix(postfix_dict)
 

From 0f8b8a5a9f2d3e3f8bc7abff328b5341c3c158d0 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 15:53:20 +0100
Subject: [PATCH 370/633] added test and refactored env variables

---
 rasa/__main__.py                     |  2 +-
 rasa/constants.py                    |  4 ++
 rasa/utils/tensorflow/__init__.py    | 67 ------------------------
 rasa/utils/tensorflow/environment.py | 76 ++++++++++++++++++++++++++++
 tests/utils/test_tf_environment.py   | 16 ++++++
 5 files changed, 97 insertions(+), 68 deletions(-)
 create mode 100644 rasa/utils/tensorflow/environment.py
 create mode 100644 tests/utils/test_tf_environment.py

diff --git a/rasa/__main__.py b/rasa/__main__.py
index 9c0017d4d383..482807f1f261 100644
--- a/rasa/__main__.py
+++ b/rasa/__main__.py
@@ -8,7 +8,7 @@
 from rasa.cli.arguments.default_arguments import add_logging_options
 from rasa.cli.utils import parse_last_positional_argument_as_model_path
 from rasa.utils.common import set_log_level
-from rasa.utils.tensorflow import setup_tf_environment
+from rasa.utils.tensorflow.environment import setup_tf_environment
 
 logger = logging.getLogger(__name__)
 
diff --git a/rasa/constants.py b/rasa/constants.py
index edfae6226e74..7cb9a9936507 100644
--- a/rasa/constants.py
+++ b/rasa/constants.py
@@ -59,3 +59,7 @@
 
 DEFAULT_SESSION_EXPIRATION_TIME_IN_MINUTES = 60
 DEFAULT_CARRY_OVER_SLOTS_TO_NEW_SESSION = True
+
+ENV_GPU_CONFIG = "TF_GPU_MEMORY_ALLOC"
+ENV_CPU_INTER_OP_CONFIG = "TF_INTER_OP_PARALLELISM_THREADS"
+ENV_CPU_INTRA_OP_CONFIG = "TF_INTRA_OP_PARALLELISM_THREADS"
diff --git a/rasa/utils/tensorflow/__init__.py b/rasa/utils/tensorflow/__init__.py
index 5bb0a5ee9ca8..b28b04f64312 100644
--- a/rasa/utils/tensorflow/__init__.py
+++ b/rasa/utils/tensorflow/__init__.py
@@ -1,70 +1,3 @@
-import os
-import tensorflow as tf
-import logging
 
-logger = logging.getLogger(__name__)
 
 
-def setup_gpu_environment(gpu_memory_config):
-
-    if gpu_memory_config:
-
-        # Parse GPU config
-        # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
-        # Parse it and store in a dictionary
-        parsed_gpu_config = {
-            instance.split(":")[0].strip(): int(instance.split(":")[1].strip())
-            for instance in gpu_memory_config.split(",")
-        }
-
-        physical_gpus = tf.config.list_physical_devices("GPU")
-
-        # Logic taken from https://www.tensorflow.org/guide/gpu
-        if physical_gpus:
-
-            for gpu_id, gpu_id_memory in parsed_gpu_config.items():
-                try:
-                    tf.config.experimental.set_virtual_device_configuration(
-                        physical_gpus[int(gpu_id)],
-                        [
-                            tf.config.experimental.VirtualDeviceConfiguration(
-                                memory_limit=gpu_id_memory
-                            )
-                        ],
-                    )
-
-                except RuntimeError as e:
-                    # Virtual devices must be set before GPUs have been initialized
-                    raise RuntimeError(
-                        "Error while setting up tensorflow environment. "
-                        "Virtual devices must be set before GPUs have been initialized"
-                    )
-
-        else:
-            logger.info(
-                "You have an environment variable GPU_MEMORY_ALLOC set but no GPUs were detected to configure"
-            )
-
-
-def setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads):
-
-    if inter_op_parallel_threads:
-        tf.config.threading.set_inter_op_parallelism_threads(
-            int(inter_op_parallel_threads.strip())
-        )
-
-    if intra_op_parallel_threads:
-        tf.config.threading.set_intra_op_parallelism_threads(
-            int(intra_op_parallel_threads.strip())
-        )
-
-
-def setup_tf_environment():
-
-    # Get all env variables
-    gpu_memory_config = os.getenv("TF_GPU_MEMORY_ALLOC", None)
-    inter_op_parallel_threads = os.getenv("TF_INTER_OP_PARALLELISM_THREADS", None)
-    intra_op_parallel_threads = os.getenv("TF_INTRA_OP_PARALLELISM_THREADS", None)
-
-    setup_gpu_environment(gpu_memory_config)
-    setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads)
diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
new file mode 100644
index 000000000000..acab6c726176
--- /dev/null
+++ b/rasa/utils/tensorflow/environment.py
@@ -0,0 +1,76 @@
+import logging
+import os
+
+import tensorflow as tf
+from rasa.constants import (
+    ENV_GPU_CONFIG,
+    ENV_CPU_INTER_OP_CONFIG,
+    ENV_CPU_INTRA_OP_CONFIG,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def setup_gpu_environment(gpu_memory_config):
+
+    if gpu_memory_config:
+
+        # Parse GPU config
+        # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
+        # Parse it and store in a dictionary
+        parsed_gpu_config = {
+            instance.split(":")[0].strip(): int(instance.split(":")[1].strip())
+            for instance in gpu_memory_config.split(",")
+        }
+
+        physical_gpus = tf.config.list_physical_devices("GPU")
+
+        # Logic taken from https://www.tensorflow.org/guide/gpu
+        if physical_gpus:
+
+            for gpu_id, gpu_id_memory in parsed_gpu_config.items():
+                try:
+                    tf.config.experimental.set_virtual_device_configuration(
+                        physical_gpus[int(gpu_id)],
+                        [
+                            tf.config.experimental.VirtualDeviceConfiguration(
+                                memory_limit=gpu_id_memory
+                            )
+                        ],
+                    )
+
+                except RuntimeError as e:
+                    # Virtual devices must be set before GPUs have been initialized
+                    raise RuntimeError(
+                        "Error while setting up tensorflow environment. "
+                        "Virtual devices must be set before GPUs have been initialized"
+                    )
+
+        else:
+            logger.info(
+                "You have an environment variable GPU_MEMORY_ALLOC set but no GPUs were detected to configure"
+            )
+
+
+def setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads):
+
+    if inter_op_parallel_threads:
+        tf.config.threading.set_inter_op_parallelism_threads(
+            int(inter_op_parallel_threads.strip())
+        )
+
+    if intra_op_parallel_threads:
+        tf.config.threading.set_intra_op_parallelism_threads(
+            int(intra_op_parallel_threads.strip())
+        )
+
+
+def setup_tf_environment():
+
+    # Get all env variables
+    gpu_memory_config = os.getenv(ENV_GPU_CONFIG, None)
+    inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
+    intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
+
+    setup_gpu_environment(gpu_memory_config)
+    setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads)
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
new file mode 100644
index 000000000000..acb58902a815
--- /dev/null
+++ b/tests/utils/test_tf_environment.py
@@ -0,0 +1,16 @@
+import tensorflow as tf
+from rasa.utils.tensorflow.environment import (
+    setup_cpu_environment,
+    setup_gpu_environment,
+)
+
+
+def test_tf_cpu_environment_setting():
+
+    inter_op_threads = "2"
+    intra_op_threads = "3"
+
+    setup_cpu_environment(inter_op_threads, intra_op_threads)
+
+    assert tf.config.threading.get_inter_op_parallelism_threads() == 2
+    assert tf.config.threading.get_intra_op_parallelism_threads() == 3

From 3deda46aa51757780c069bccc079264a03fabd35 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:04:16 +0100
Subject: [PATCH 371/633] remove extra lines

---
 rasa/utils/tensorflow/__init__.py  | 3 ---
 tests/utils/test_tf_environment.py | 5 +----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/rasa/utils/tensorflow/__init__.py b/rasa/utils/tensorflow/__init__.py
index b28b04f64312..e69de29bb2d1 100644
--- a/rasa/utils/tensorflow/__init__.py
+++ b/rasa/utils/tensorflow/__init__.py
@@ -1,3 +0,0 @@
-
-
-
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index acb58902a815..14c0d2ba2e98 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -1,8 +1,5 @@
 import tensorflow as tf
-from rasa.utils.tensorflow.environment import (
-    setup_cpu_environment,
-    setup_gpu_environment,
-)
+from rasa.utils.tensorflow.environment import setup_cpu_environment
 
 
 def test_tf_cpu_environment_setting():

From 75663a0baa0cce3184389d1c5ee0f413f2c69aad Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:10:12 +0100
Subject: [PATCH 372/633] added changelog

---
 changelog/5230.feature.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 changelog/5230.feature.rst

diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst
new file mode 100644
index 000000000000..5d7599047663
--- /dev/null
+++ b/changelog/5230.feature.rst
@@ -0,0 +1,15 @@
+Refactor how GPU and CPU environments are configured for tensorflow 2.0
+
+Environment variables to set and description is shown in the example below:
+
+.. code-block:: python
+
+    # This specifies to use 1024 MB of memory from GPU with logical ID 0 and 2048 MB of memory from GPU with logical ID 1
+    TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"
+
+    # Specifies that atmost 3 CPU threads can be used to parallelize multiple non-blocking operations
+    TF_INTER_OP_PARALLELISM_THREADS="3"
+
+    # Specifies that atmost 2 CPU threads can be used to parallelize a particular operation.
+    TF_INTRA_OP_PARALLELISM_THREADS="2"
+

From 1ba52e09909c8a78ec239f9afb851222949be708 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:12:39 +0100
Subject: [PATCH 373/633] removed unref variable

---
 rasa/utils/tensorflow/environment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index acab6c726176..2afe9ba056c8 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -39,7 +39,7 @@ def setup_gpu_environment(gpu_memory_config):
                         ],
                     )
 
-                except RuntimeError as e:
+                except RuntimeError:
                     # Virtual devices must be set before GPUs have been initialized
                     raise RuntimeError(
                         "Error while setting up tensorflow environment. "

From 9008960f510b99c3460f0ffb32da1456a9bdc454 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:17:40 +0100
Subject: [PATCH 374/633] add types

---
 rasa/utils/tensorflow/environment.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 2afe9ba056c8..eb3401a23a23 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import Text
 
 import tensorflow as tf
 from rasa.constants import (
@@ -11,7 +12,7 @@
 logger = logging.getLogger(__name__)
 
 
-def setup_gpu_environment(gpu_memory_config):
+def setup_gpu_environment(gpu_memory_config: Text) -> None:
 
     if gpu_memory_config:
 
@@ -52,7 +53,9 @@ def setup_gpu_environment(gpu_memory_config):
             )
 
 
-def setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads):
+def setup_cpu_environment(
+    inter_op_parallel_threads: Text, intra_op_parallel_threads: Text
+) -> None:
 
     if inter_op_parallel_threads:
         tf.config.threading.set_inter_op_parallelism_threads(

From cb7d195162313d5d4925ab6384bbe93b950f654b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:39:12 +0100
Subject: [PATCH 375/633] added random seed and number of epochs to crf
 extractor test

---
 tests/nlu/conftest.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 8eed2885117c..27ab3a5ccc13 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -5,7 +5,7 @@
 
 from rasa.nlu import config, train
 from rasa.nlu.components import ComponentBuilder
-from rasa.utils.tensorflow.constants import EPOCHS
+from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED
 
 CONFIG_DEFAULTS_PATH = "sample_configs/config_defaults.yml"
 
@@ -42,7 +42,9 @@ def ner_crf_pos_feature_config():
             ["low", "title", "upper", "pos", "pos2"],
             ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
             ["low", "title", "upper", "pos", "pos2"],
-        ]
+        ],
+        EPOCHS: 100,
+        RANDOM_SEED: 2020,
     }
 
 

From 32c2ead2e59aaf9852686f611b96ad3491c31642 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:40:04 +0100
Subject: [PATCH 376/633] added random seed and number of epochs to crf
 extractor test

---
 tests/nlu/extractors/test_crf_entity_extractor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index 98a74b6088e1..f90ac8cdf1d7 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -13,7 +13,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
                 "entities": [
                     {"start": 16, "end": 20, "value": "west", "entity": "location"}
                 ],
-                "spacy_doc": spacy_nlp("anywhere in the west"),
+                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
             },
         ),
         Message(
@@ -36,7 +36,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
                         "extractor": "CRFEntityExtractor",
                     },
                 ],
-                "spacy_doc": spacy_nlp("central indian restaurant"),
+                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
             },
         ),
     ]

From 5ec8d3edff27e21c9529659310e7da4de7b9157f Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 16:51:43 +0100
Subject: [PATCH 377/633] remove old tests

---
 rasa/utils/tensorflow/environment.py |  2 +-
 tests/core/test_policies.py          | 73 ----------------------------
 2 files changed, 1 insertion(+), 74 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index eb3401a23a23..e015a5e3aec3 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -41,7 +41,7 @@ def setup_gpu_environment(gpu_memory_config: Text) -> None:
                     )
 
                 except RuntimeError:
-                    # Virtual devices must be set before GPUs have been initialized
+                    # Add a helper explanation where the error comes from
                     raise RuntimeError(
                         "Error while setting up tensorflow environment. "
                         "Virtual devices must be set before GPUs have been initialized"
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index d39a6e741a77..aaffba0c3669 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -52,35 +52,6 @@
 from tests.core.utilities import get_tracker, read_dialogue_file, user_uttered
 
 
-def tf_defaults():
-    return {
-        "tf_config": {
-            "device_count": {"CPU": 4},
-            # tell tf.Session to use CPU limit, if you have
-            # more CPU, you can increase this value appropriately
-            "inter_op_parallelism_threads": 0,
-            # the number of threads in the thread pool available
-            # for each process for blocking operation nodes set to 0
-            # to allow the system to select the appropriate value.
-            "intra_op_parallelism_threads": 0,  # tells the degree of thread
-            # parallelism of the tf.Session operation.
-            # the smaller the value, the less reuse the thread will have
-            # and the more likely it will use more CPU cores.
-            # if the value is 0,
-            # tensorflow will automatically select an appropriate value.
-            "gpu_options": {"allow_growth": True}
-            # if set True, will try to allocate
-            # as much GPU memory as possible to support running
-        }
-    }
-
-
-def session_config():
-    import tensorflow as tf
-
-    return tf.ConfigProto(**tf_defaults()["tf_config"])
-
-
 async def train_trackers(domain, augmentation_factor=20):
     return await training.load_data(
         DEFAULT_STORIES_FILE, domain, augmentation_factor=augmentation_factor
@@ -185,18 +156,6 @@ def test_persist_and_load_empty_policy(self, tmpdir):
         loaded = empty_policy.__class__.load(tmpdir.strpath)
         assert loaded is not None
 
-    # TODO test tf config
-    # def test_tf_config(self, trained_policy, tmpdir):
-    #     if hasattr(trained_policy, "session"):
-    #         import tensorflow as tf
-    #
-    #         # noinspection PyProtectedMember
-    #         assert trained_policy.session._config == tf.Session()._config
-    #         trained_policy.persist(tmpdir.strpath)
-    #         loaded = trained_policy.__class__.load(tmpdir.strpath)
-    #         # noinspection PyProtectedMember
-    #         assert loaded.session._config == tf.Session()._config
-
     @staticmethod
     def _get_next_action(policy, events, domain):
         tracker = get_tracker(events)
@@ -212,22 +171,6 @@ def create_policy(self, featurizer, priority):
         return p
 
 
-class TestKerasPolicyWithTfConfig(PolicyTestCollection):
-    def create_policy(self, featurizer, priority):
-        p = KerasPolicy(featurizer, priority, **tf_defaults())
-        return p
-
-    # TODO fix and test tf config
-    @pytest.mark.skip(reason="We need to fix tf.config!")
-    def test_tf_config(self, trained_policy, tmpdir):
-        # noinspection PyProtectedMember
-        assert trained_policy.session._config == session_config()
-        trained_policy.persist(tmpdir.strpath)
-        loaded = trained_policy.__class__.load(tmpdir.strpath)
-        # noinspection PyProtectedMember
-        assert loaded.session._config == session_config()
-
-
 class TestSklearnPolicy(PolicyTestCollection):
     def create_policy(self, featurizer, priority, **kwargs):
         p = SklearnPolicy(featurizer, priority, **kwargs)
@@ -529,22 +472,6 @@ def create_policy(self, featurizer, priority):
         return p
 
 
-class TestTEDPolicyWithTfConfig(TestTEDPolicy):
-    def create_policy(self, featurizer, priority):
-        p = TEDPolicy(featurizer=featurizer, priority=priority, **tf_defaults())
-        return p
-
-    # TODO test tf config
-    @pytest.mark.skip(reason="Fix tf config.")
-    def test_tf_config(self, trained_policy, tmpdir):
-        # noinspection PyProtectedMember
-        assert trained_policy.session._config == session_config()
-        trained_policy.persist(tmpdir.strpath)
-        loaded = trained_policy.__class__.load(tmpdir.strpath)
-        # noinspection PyProtectedMember
-        assert loaded.session._config == session_config()
-
-
 class TestMemoizationPolicy(PolicyTestCollection):
     def create_policy(self, featurizer, priority):
         max_history = None

From 86ee3375e79d1ceccadf0d2b33a41e4bf18933c4 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 17:02:41 +0100
Subject: [PATCH 378/633] added constants

---
 rasa/nlu/constants.py                         |  5 +++++
 .../dense_featurizer/lm_featurizer.py         |  6 ++++--
 rasa/nlu/tokenizers/lm_tokenizer.py           |  3 ++-
 .../nlu/utils/hugging_face/hf_transformers.py | 21 ++++++++++++-------
 4 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 833d169341b9..16f1b261b242 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -36,6 +36,11 @@
     RESPONSE: "response_language_model_doc",
 }
 
+TOKEN_IDS = "token_ids"
+TOKENS = "tokens"
+SEQUENCE_FEATURES = "sequence_features"
+SENTENCE_FEATURES = "sentence_features"
+
 SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"}
 
 
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 11213916312b..eef8f16d1ac8 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -11,6 +11,8 @@
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
     TOKENS_NAMES,
+    SEQUENCE_FEATURES,
+    SENTENCE_FEATURES,
 )
 
 
@@ -49,8 +51,8 @@ def _set_lm_features(self, message: Message, attribute: Text = TEXT):
         doc = self.get_doc(message, attribute)
 
         if doc is not None:
-            sequence_features = doc["sequence_features"]
-            sentence_features = doc["sentence_features"]
+            sequence_features = doc[SEQUENCE_FEATURES]
+            sentence_features = doc[SENTENCE_FEATURES]
 
             features = np.concatenate([sequence_features, sentence_features])
 
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 21d750a13cef..0a12e65576c1 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -8,6 +8,7 @@
     LANGUAGE_MODEL_DOCS,
     DENSE_FEATURIZABLE_ATTRIBUTES,
     MESSAGE_ATTRIBUTES,
+    TOKENS,
 )
 
 
@@ -32,4 +33,4 @@ def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
     def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
 
-        return doc["tokens"]
+        return doc[TOKENS]
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 846efc532179..4094f5313f98 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -17,11 +17,18 @@
     model_embeddings_post_processors,
     model_tokens_cleaners,
 )
+from rasa.nlu.constants import (
+    TEXT,
+    LANGUAGE_MODEL_DOCS,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    TOKEN_IDS,
+    TOKENS,
+    SENTENCE_FEATURES,
+    SEQUENCE_FEATURES,
+)
 
 logger = logging.getLogger(__name__)
 
-from rasa.nlu.constants import TEXT, LANGUAGE_MODEL_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES
-
 
 class HFTransformersNLP(Component):
     provides = [
@@ -280,12 +287,10 @@ def _get_docs_for_batch(
         batch_docs = []
         for index in range(len(batch_examples)):
             doc = {
-                "token_ids": batch_token_ids[index],
-                "tokens": batch_tokens[index],
-                "sequence_features": batch_sequence_features[index],
-                "sentence_features": np.reshape(
-                    batch_sentence_features[index], (1, -1)
-                ),
+                TOKEN_IDS: batch_token_ids[index],
+                TOKENS: batch_tokens[index],
+                SEQUENCE_FEATURES: batch_sequence_features[index],
+                SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)),
             }
             batch_docs.append(doc)
 

From 142327ad97336b457242f462ad2e438c153a97f9 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Wed, 12 Feb 2020 18:12:21 +0100
Subject: [PATCH 379/633] added docs

---
 docs/api/tensorflow_usage.rst | 58 +++++++++++++++++++++++++++++++++++
 docs/index.rst                |  1 +
 2 files changed, 59 insertions(+)
 create mode 100644 docs/api/tensorflow_usage.rst

diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
new file mode 100644
index 000000000000..8ecc0473a677
--- /dev/null
+++ b/docs/api/tensorflow_usage.rst
@@ -0,0 +1,58 @@
+:desc: Find out how to configure your environment for efficient usage of tensorflow inside Rasa
+
+.. _tensorflow_usage:
+
+Setting up Tensorflow Runtime
+=============================
+
+Tensorflow allows setting the runtime environment via
+`TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa supports a smaller subset of these
+configuration options and makes appropriate calls to the `tf.config` submodule.
+This smaller subset comprises of configurations that developers frequently use with Rasa.
+All configuration options are specified using environment variables as shown in subsequent sections.
+
+
+Optimize CPU Performance
+------------------------
+
+Parallelize one operation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
+to parallelize the execution of one operation. If left unspecified, this value defaults to 0 which means tensorflow should
+pick an appropriate value depending on the system configuration.
+
+
+Parallelize multiple operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
+to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to 0
+which means tensorflow should pick an appropriate value depending on the system configuration.
+
+
+Optimize GPU Performance
+------------------------
+
+Limiting GPU memory growth
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Tensorflow by default blocks all the available GPU memory for the running process. This can be limiting if you are running
+multiple tensorflow processes and want to distribute memory across them. To prevent this,
+set an environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
+
+
+Restricting absolute GPU memory available
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Often, a developer wants to limit the absolute amount of GPU memory that can be used by a process.
+
+For example, you may have two visible GPUs(GPU:0 and GPU:1) and you want to allocate 1024 MB from first GPU and 2048 MB from second GPU.
+You can do so by setting an environment variable as ``TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"``.
+
+Another scenario can be where you have access to 2 GPUs(GPU:0 and GPU:1) but you would like to use only second GPU for
+Rasa process.
+``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory from GPU 1 available for the Rasa process
+
+
+
diff --git a/docs/index.rst b/docs/index.rst
index 150317568b7a..1074a8dcf694 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -93,6 +93,7 @@ Understand messages, hold conversations, and connect to messaging channels and A
    api/lock-stores
    api/training-data-importers
    api/core-featurization
+   api/tensorflow_usage
    migration-guide
    changelog
 

From 904df4936dfda18178202488785a040623ff8b81 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 12 Feb 2020 21:18:39 +0100
Subject: [PATCH 380/633] update docs

---
 docs/core/policies.rst                        | 173 +++--
 docs/migration-guide.rst                      |  96 ++-
 docs/nlu/components.rst                       | 634 +++++++++++++-----
 rasa/core/policies/embedding_policy.py        |  12 +-
 rasa/core/policies/ted_policy.py              |  32 +-
 rasa/nlu/classifiers/diet_classifier.py       |  42 +-
 .../embedding_intent_classifier.py            |  15 +-
 rasa/nlu/extractors/crf_entity_extractor.py   |   7 +-
 rasa/nlu/selectors/response_selector.py       |  33 +-
 rasa/utils/tensorflow/constants.py            |  15 +-
 rasa/utils/train_utils.py                     |  14 +-
 11 files changed, 772 insertions(+), 301 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index def75b9d9f22..83b636f60360 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -148,7 +148,7 @@ expected outcome in the case of a tie. They look like this, where higher numbers
     | 4. ``FallbackPolicy`` and ``TwoStageFallbackPolicy``
     | 3. ``MemoizationPolicy`` and ``AugmentedMemoizationPolicy``
     | 2. ``MappingPolicy``
-    | 1. ``TEDPolicy``, ``KerasPolicy``, and ``SklearnPolicy``
+    | 1. ``TEDPolicy``, ``EmbeddingPolicy``, ``KerasPolicy``, and ``SklearnPolicy``
 
 This priority hierarchy ensures that, for example, if there is an intent with a mapped action, but the NLU confidence is not
 above the ``nlu_threshold``, the bot will still fall back. In general, it is not recommended to have more
@@ -197,8 +197,10 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer.
 Embedding Policy
 ^^^^^^^^^^^^^^^^
 
-``EmbeddingPolicy`` got renamed to ``TEDPolicy``.
-Please take a look at :ref:`ted_policy` for more details.
+    .. warning::
+
+        ``EmbeddingPolicy`` got renamed to ``TEDPolicy``. Please use :ref:`ted_policy` instead of ``EmbeddingPolicy``.
+        The functionality of the policy stayed the same.
 
 
 .. _ted_policy:
@@ -208,54 +210,46 @@ TED Policy
 
 Transformer Embedding Dialogue Policy (TEDP)
 
-Transformer version of the Recurrent Embedding Dialogue Policy (REDP)
-used in our paper: `<https://arxiv.org/abs/1811.11707>`_
+The policy used in our paper https://arxiv.org/abs/1910.00486.
 
 This policy has a pre-defined architecture, which comprises the
 following steps:
 
-    - concatenate user input (user intent and entities),
-      previous system action, slots and active form
-      for each time step into an input vector
-      to pre-transformer embedding layer;
+    - concatenate user input (user intent and entities), previous system action, slots and active form for each time
+      step into an input vector to pre-transformer embedding layer;
     - feed it to transformer;
-    - apply a dense layer to the output of the transformer
-      to get embeddings of a dialogue for each time step;
+    - apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step;
     - apply a dense layer to create embeddings for system actions for each time step;
-    - calculate the similarity between the
-      dialogue embedding and embedded system actions.
-      This step is based on the
-      `StarSpace <https://arxiv.org/abs/1709.03856>`_ idea.
+    - calculate the similarity between the dialogue embedding and embedded system actions.
+      This step is based on the `StarSpace <https://arxiv.org/abs/1709.03856>`_ idea.
 
-It is recommended to use
-``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)``
+It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)``
 (see :ref:`featurization_conversations` for details).
 
 **Configuration:**
 
     Configuration parameters can be passed as parameters to the
-    ``TEDPolicy`` within the policy configuration file.
+    ``TEDPolicy`` within the configuration file.
 
     .. warning::
 
-        Pass an appropriate number of ``epochs`` to the ``TEDPolicy``,
-        otherwise the policy will be trained only for ``1``
-        epoch.
+        Pass an appropriate number of ``epochs`` to the ``TEDPolicy``, otherwise the policy will be trained only
+        for ``1`` epoch.
 
     The algorithm also has hyper-parameters to control:
 
         - neural network's architecture:
 
-            - ``hidden_layers_sizes_b`` sets a list of hidden layers
+            - ``hidden_layers_sizes`` sets a list of hidden layers
               sizes before embedding layer for system actions, the number
-              of hidden layers is equal to the length of the list;
-            - ``transformer_size`` sets the number of units in the transfomer;
-            - ``num_transformer_layers`` sets the number of transformer layers;
-            - ``pos_encoding`` sets the type of positional encoding in transformer,
-              it should be either ``timing`` or ``emb``;
-            - ``max_seq_length`` sets maximum sequence length
-              if embedding positional encodings are used;
-            - ``num_heads`` sets the number of heads in multihead attention;
+              of hidden layers is equal to the length of the list.
+            - ``transformer_size`` sets the number of units in the transfomer.
+            - ``number_of_transformer_layers`` sets the number of transformer layers.
+            - ``maximum_sequence_length`` sets maximum sequence length.
+            - ``number_of_attention_heads`` sets the number of heads in multihead attention.
+            - ``use_key_relative_attention`` if true use key relative embeddings in attention.
+            - ``use_value_relative_attention`` if true use key relative embeddings in attention.
+            - ``max_relative_position`` sets the max position for relative embeddings.
 
         - training:
 
@@ -272,8 +266,8 @@ It is recommended to use
 
         - embedding:
 
-            - ``embed_dim`` sets the dimension of embedding space;
-            - ``num_neg`` sets the number of incorrect intent labels,
+            - ``embedding_dimension`` sets the dimension of embedding space;
+            - ``number_of_negative_examples`` sets the number of incorrect intent labels,
               the algorithm will minimize their similarity to the user
               input during training;
             - ``similarity_type`` sets the type of the similarity,
@@ -285,13 +279,13 @@ It is recommended to use
             - ``ranking_length`` defines the number of top confidences over
               which to normalize ranking results if ``loss_type: "softmax"``;
               to turn off normalization set it to 0
-            - ``mu_pos`` controls how similar the algorithm should try
+            - ``maximum_positive_similarity`` controls how similar the algorithm should try
               to make embedding vectors for correct intent labels,
               used only if ``loss_type`` is set to ``margin``;
-            - ``mu_neg`` controls maximum negative similarity for
+            - ``maximum_negative_similarity`` controls maximum negative similarity for
               incorrect intents,
               used only if ``loss_type`` is set to ``margin``;
-            - ``use_max_sim_neg`` if ``true`` the algorithm only
+            - ``use_maximum_negative_similarity`` if ``true`` the algorithm only
               minimizes maximum similarity over incorrect intent labels,
               used only if ``loss_type`` is set to ``margin``;
             - ``scale_loss`` if ``true`` the algorithm will downscale the loss
@@ -300,20 +294,21 @@ It is recommended to use
 
         - regularization:
 
-            - ``C2`` sets the scale of L2 regularization
-            - ``C_emb`` sets the scale of how important is to minimize
+            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``negative_margin_scale`` sets the scale of how important is to minimize
               the maximum similarity between embeddings of different
-              intent labels, used only if ``loss_type`` is set to ``margin``;
-            - ``droprate_a`` sets the dropout rate between
-              layers before embedding layer for user inputs;
-            - ``droprate_b`` sets the dropout rate between layers
-              before embedding layer for system actions;
+              intent labels, used only if ``loss_type`` is set to ``margin``.
+            - ``droprate_dialogue`` sets the dropout rate between
+              layers before embedding layer for user inputs.
+            - ``droprate_label`` sets the dropout rate between layers
+              before embedding layer for system actions.
+            - ``droprate_attention`` sets the dropout rate for attention.
 
         - train accuracy calculation:
 
-            - ``evaluate_every_num_epochs`` sets how often to calculate
-              train accuracy, small values may hurt performance;
-            - ``evaluate_on_num_examples`` how many examples to use for
+            - ``evaluate_every_number_of_epochs`` sets how often to calculate
+              train accuracy, small values may hurt performance.
+            - ``evaluate_on_number_of_examples`` how many examples to use for
               hold out validation set to calculate of validation accuracy,
               large values may hurt performance.
 
@@ -328,11 +323,11 @@ It is recommended to use
 
     .. warning::
 
-        If ``evaluate_on_num_examples`` is non zero, random examples will be
+        If ``evaluate_on_number_of_examples`` is non zero, random examples will be
         picked by stratified split and used as **hold out** validation set,
         so they will be excluded from training data.
         We suggest to set it to zero if data set contains a lot of unique examples
-        of dialogue turns
+        of dialogue turns.
 
     .. note::
 
@@ -341,7 +336,7 @@ It is recommended to use
 
     .. note::
 
-        For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should
+        For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
         be between ``-1`` and ``1``.
 
     .. note::
@@ -353,19 +348,85 @@ It is recommended to use
         ``batch_size`` is required, pass an ``int``, e.g.
         ``"batch_size": 8``.
 
-    These parameters can be specified in the policy configuration file.
-    The default values are defined in ``EmbeddingPolicy.defaults``:
+    These parameters can be specified in the configuration file.
+    The following default values are set:
+
+    .. code-block:: yaml
 
-    .. literalinclude:: ../../rasa/core/policies/ted_policy.py
-       :dedent: 4
-       :start-after: # default properties (DOC MARKER - don't remove)
-       :end-before: # end default properties (DOC MARKER - don't remove)
+        # nn architecture
+        # a list of hidden layers sizes before dialogue and action embed layers
+        # number of hidden layers is equal to the length of this list
+        "hidden_layers_sizes": {"dialogue": [], "label": []}
+        # number of units in transformer
+        "transformer_size": 128
+        # number of transformer layers
+        "number_of_transformer_layers": 1
+        # max sequence length
+        "maximum_sequence_length": 256
+        # number of attention heads in transformer
+        "number_of_attention_heads": 4
+        # if true use key relative embeddings in attention
+        "use_key_relative_attention": False
+        # if true use key relative embeddings in attention
+        "use_value_relative_attention": False
+        # max position for relative embeddings
+        "max_relative_position": None
+        # training parameters
+        # initial and final batch sizes:
+        # batch size will be linearly increased for each epoch
+        "batch_size": [8, 32]
+        # how to create batches
+        "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
+        # number of epochs
+        "epochs": 1
+        # set random seed to any int to get reproducible results
+        "random_seed": None
+        # embedding parameters
+        # dimension size of embedding vectors
+        "embedding_dimension": 20
+        # the type of the similarity
+        "number_of_negative_examples": 20
+        # flag if minimize only maximum similarity over incorrect labels
+        "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
+        # the type of the loss function
+        "loss_type": "softmax"  # string 'softmax' or 'margin'
+        # number of top actions to normalize scores for softmax loss_type
+        # set to 0 to turn off normalization
+        "ranking_length": 10
+        # how similar the algorithm should try
+        # to make embedding vectors for correct labels
+        "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
+        # maximum negative similarity for incorrect labels
+        "maximum_negative_similarity": -0.2  # should be -1.0 < ... < 1.0 for 'cosine'
+        # the number of incorrect labels, the algorithm will minimize
+        # their similarity to the user input during training
+        "use_maximum_negative_similarity": True  # flag which loss function to use
+        # scale loss inverse proportionally to confidence of correct prediction
+        "scale_loss": True
+        # regularization
+        # the scale of regularization
+        "regularization_constant": 0.001
+        # the scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels
+        "negative_margin_scale": 0.8
+        # dropout rate for dial nn
+        "droprate_dialogue": 0.1
+        # dropout rate for bot nn
+        "droprate_label": 0.0
+        # dropout rate for attention
+        "droprate_attention": 0
+        # visualization of accuracy
+        # how often calculate validation accuracy
+        "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+        # how many examples to use for hold out validation set
+        "evaluate_on_number_of_examples": 0  # large values may hurt performance
 
     .. note::
 
-          Parameter ``mu_neg`` is set to a negative value to mimic
+          Parameter ``maximum_negative_similarity`` is set to a negative value to mimic
           the original starspace algorithm in the case
-          ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``. See
+          ``maximum_negative_similarity = maximum_positive_similarity`` and
+          ``use_maximum_negative_similarity = False``. See
           `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
 
 .. _mapping-policy:
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 3dfe3657bda3..967cbc3481e9 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -33,8 +33,100 @@ General
       epochs: 100
 
   The given snippet specifies default values for the parameters ``max_history`` and
-  ``epochs``. ``max_history`` is particularly important and strongly depends on your stories. Please see the docs of the :ref:`embedding_policy` if you want to
-  customize them.
+  ``epochs``. ``max_history`` is particularly important and strongly depends on your stories.
+  Please see the docs of the :ref:`embedding_policy` if you want to customize them.
+
+- The :ref:`embedding_policy` got renamed to :ref:`ted_policy`. The functionality of the policy stayed the same.
+  Please update your configuration files to use ``TEDPolicy`` instead of ``EmbeddingPolicy``.
+
+- Most of the model options for ``EmbeddingPolicy``, ``EmbeddingIntentClassifier``, and ``ResponseSelector`` got
+  renamed. Please update your configuration files using the following mapping:
+
+  =============================  =======================================================
+  Old model option               New model option
+  =============================  =======================================================
+  hidden_layers_sizes_a          dictionary "hidden_layers_sizes" with key "text"
+  hidden_layers_sizes_b          dictionary "hidden_layers_sizes" with key "label"
+  hidden_layers_sizes_pre_dial   dictionary "hidden_layers_sizes" with key "dialogue"
+  hidden_layers_sizes_bot        dictionary "hidden_layers_sizes" with key "label"
+  num_transformer_layers         number_of_transformer_layers
+  num_heads                      number_of_attention_heads
+  max_seq_length                 maximum_sequence_length
+  dense_dim                      dense_dimension
+  embed_dim                      embedding_dimension
+  num_neg                        number_of_negative_examples
+  mu_pos                         maximum_positive_similarity
+  mu_neg                         maximum_negative_similarity
+  use_max_sim_neg                use_maximum_negative_similarity
+  C2                             regularization_constant
+  C_emb                          negative_margin_scale
+  droprate_a                     droprate_dialogue
+  droprate_b                     droprate_label
+  evaluate_every_num_epochs      evaluate_every_number_of_epochs
+  evaluate_on_num_examples       evaluate_on_number_of_examples
+  =============================  =======================================================
+
+  A warning will be logged in case an old option is used.
+
+- ``EmbeddingIntentClassifier`` is now deprecated and will be replaced by ``DIETClassifier`` in the future.
+  ``DIETClassifier`` builds on top the model architecture of the ``EmbeddingIntentClassifier``. ``DIETClassfier``
+  allows you to train one model for entity extraction and intent classification. However, if you want to
+  get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use the following configuration of
+  ``DIETClassifier``:
+
+  .. code-block:: yaml
+
+    pipeline:
+    - ... # other components
+    - name: DIETClassifier
+      intent_classification: True
+      entity_recognition: False
+      use_masked_language_model: False
+      BILOU_flag: False
+      number_of_transformer_layers: 0
+      ... # any other parameters
+
+  See :ref:`diet-classifier` for more information about the new component.
+
+- ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. ``DIETClassfier``
+  allows you to train one model for entity extraction and intent classification. However, if you want to
+  get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration:
+
+  .. code-block:: yaml
+
+    pipeline:
+    - ... # other components
+    - name: LexicalSyntacticFeaturizer
+      features: [
+        ["low", "title", "upper"],
+        [
+          "BOS",
+          "EOS",
+          "low",
+          "prefix5",
+          "prefix2",
+          "suffix5",
+          "suffix3",
+          "suffix2",
+          "upper",
+          "title",
+          "digit",
+        ],
+        ["low", "title", "upper"],
+      ]
+    - name: DIETClassifier
+      intent_classification: False
+      entity_recognition: True
+      use_masked_language_model: False
+      number_of_transformer_layers: 0
+      ... # any other parameters
+
+  As you can see in the configuration, you need to add the ``LexicalSyntacticFeaturizer`` before the ``DIETClassifier``
+  to your pipeline. ``CRFEntityExtractor`` featurizes user messages on its own, it does not depend on any featurizer.
+  We extracted the featurization from the component into the new featurizer ``LexicalSyntacticFeaturizer``. Thus,
+  in order to obtain the same results as before, you need to add this featurizer to your pipeline before the
+  ``DIETClassifier``. For more information about the ``DIETClassifier`` and the ``LexicalSyntacticFeaturizer``
+  see :ref:`components`.
 
 .. _migration-to-rasa-1.7:
 
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c21eae496866..e8385992e8f5 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -32,8 +32,8 @@ MitieNLP
 ~~~~~~~~
 
 :Short: MITIE initializer
-:Outputs: nothing
-:Requires: nothing
+:Outputs: Nothing
+:Requires: Nothing
 :Description:
     Initializes mitie structures. Every mitie component relies on this,
     hence this should be put at the beginning
@@ -58,8 +58,8 @@ SpacyNLP
 ~~~~~~~~
 
 :Short: spaCy language initializer
-:Outputs: nothing
-:Requires: nothing
+:Outputs: Nothing
+:Requires: Nothing
 :Description:
     Initializes spacy structures. Every spaCy component relies on this, hence this should be put at the beginning
     of every pipeline that uses any spaCy components.
@@ -107,7 +107,7 @@ WhitespaceTokenizer
 
 :Short: Tokenizer using whitespaces as a separator
 :Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
-:Requires: nothing
+:Requires: Nothing
 :Description:
     Creates a token for every whitespace separated character sequence.
 :Configuration:
@@ -131,7 +131,7 @@ JiebaTokenizer
 
 :Short: Tokenizer using Jieba for Chinese language
 :Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
-:Requires: nothing
+:Requires: Nothing
 :Description:
     Creates tokens using the Jieba tokenizer specifically for Chinese
     language. For language other than Chinese, Jieba will work as
@@ -199,7 +199,7 @@ ConveRTTokenizer
 
 :Short: Tokenizer using ConveRT
 :Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
-:Requires: nothing
+:Requires: Nothing
 :Description:
     Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.
 :Configuration:
@@ -385,7 +385,7 @@ CountVectorsFeaturizer
         In this case during prediction all unknown words will be treated as this generic word ``OOV_token``.
 
         For example, one might create separate intent ``outofscope`` in the training data containing messages of
-        different number of ``OOV_token``s and maybe some additional general words.
+        different number of ``OOV_token`` s and maybe some additional general words.
         Then an algorithm will likely classify a message with unknown words as this intent ``outofscope``.
 
         .. note::
@@ -450,6 +450,7 @@ CountVectorsFeaturizer
           OOV_token: None  # string or None
           OOV_words: []  # list of strings
 
+.. _LexicalSyntacticFeaturizer:
 
 LexicalSyntacticFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -641,70 +642,71 @@ EmbeddingIntentClassifier
 
 :Configuration:
 
-    The algorithm has hyperparameters to control:
+    The following hyperparameters can be set:
 
         - neural network's architecture:
 
-            - ``hidden_layers_sizes_text`` sets a list of hidden layer sizes before
+            - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before
               the embedding layer for user inputs, the number of hidden layers
-              is equal to the length of the list
-            - ``hidden_layers_sizes_label`` sets a list of hidden layer sizes before
+              is equal to the length of the list.
+            - ``hidden_layers_sizes.label`` sets a list of hidden layer sizes before
               the embedding layer for intent labels, the number of hidden layers
-              is equal to the length of the list
-            - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent label
+              is equal to the length of the list.
+            - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent label.
 
         - training:
 
             - ``batch_size`` sets the number of training examples in one
               forward/backward pass, the higher the batch size, the more
-              memory space you'll need;
+              memory space you'll need.
             - ``batch_strategy`` sets the type of batching strategy,
-              it should be either ``sequence`` or ``balanced``;
+              it should be either ``sequence`` or ``balanced``.
             - ``epochs`` sets the number of times the algorithm will see
               training data, where one ``epoch`` equals one forward pass and
-              one backward pass of all the training examples;
-            - ``random_seed`` if set to any int will get reproducible
-              training results for the same inputs;
-            - ``learning_rate`` to set the learning rate of the optimizer
+              one backward pass of all the training examples.
+            - ``random_seed`` if set you will get reproducible
+              training results for the same inputs.
+            - ``learning_rate`` sets the initial learning rate of the optimizer.
 
         - embedding:
 
-            - ``dense_dimension`` sets the dense dimensions to use for sparse tensors if no dense features are present
-            - ``embedding_dimension`` sets the dimension of embedding space;
-            - ``number_of_negative_examples`` sets the number of incorrect intent labels,
-              the algorithm will minimize their similarity to the user
-              input during training;
+            - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse
+              tensors if no dense features are present.
+            - ``dense_dimension.label`` sets the dense dimensions for intent labels to use for sparse
+              tensors if no dense features are present.
+            - ``embedding_dimension`` sets the dimension of embedding space.
+            - ``number_of_negative_examples`` sets the number of incorrect intent labels.
+              The algorithm will minimize their similarity to the user
+              input during training.
             - ``similarity_type`` sets the type of the similarity,
               it should be either ``auto``, ``cosine`` or ``inner``,
               if ``auto``, it will be set depending on ``loss_type``,
-              ``inner`` for ``softmax``, ``cosine`` for ``margin``;
+              ``inner`` for ``softmax``, ``cosine`` for ``margin``.
             - ``loss_type`` sets the type of the loss function,
-              it should be either ``softmax`` or ``margin``;
+              it should be either ``softmax`` or ``margin``.
             - ``ranking_length`` defines the number of top confidences over
-              which to normalize ranking results if ``loss_type: "softmax"``;
-              to turn off normalization set it to 0
+              which to normalize ranking results if ``loss_type: "softmax"``.
+              To turn off normalization set it to 0.
             - ``maximum_positive_similarity`` controls how similar the algorithm should try
               to make embedding vectors for correct intent labels,
-              used only if ``loss_type`` is set to ``margin``;
+              used only if ``loss_type`` is set to ``margin``.
             - ``maximum_negative_similarity`` controls maximum negative similarity for
-              incorrect intents,
-              used only if ``loss_type`` is set to ``margin``;
+              incorrect intents, used only if ``loss_type`` is set to ``margin``.
             - ``use_maximum_negative_similarity`` if ``true`` the algorithm only
               minimizes maximum similarity over incorrect intent labels,
-              used only if ``loss_type`` is set to ``margin``;
+              used only if ``loss_type`` is set to ``margin``.
             - ``scale_loss`` if ``true`` the algorithm will downscale the loss
               for examples where correct label is predicted with high confidence,
-              used only if ``loss_type`` is set to ``softmax``;
+              used only if ``loss_type`` is set to ``softmax``.
 
         - regularization:
 
-            - ``l2_regularization`` sets the scale of L2 regularization
-            - ``C_emb`` sets the scale of how important is to minimize
-              the maximum similarity between embeddings of different intent labels;
+            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``negative_margin_scale`` sets the scale of how important is to minimize
+              the maximum similarity between embeddings of different intent labels.
             - ``droprate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1``
-              would drop out ``10%`` of input units;
-            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not
+              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
+            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
     .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
               be between ``-1`` and ``1``.
@@ -714,19 +716,75 @@ EmbeddingIntentClassifier
               In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
               If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
 
-    In the config, you can specify these parameters.
-    The default values are defined in ``EmbeddingIntentClassifier.defaults``:
-
-    .. literalinclude:: ../../rasa/nlu/classifiers/embedding_intent_classifier.py
-       :dedent: 4
-       :start-after: # default properties (DOC MARKER - don't remove)
-       :end-before: # end default properties (DOC MARKER - don't remove)
-
     .. note:: Parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original
-              starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity`` and
-              ``use_maximum_negative_similarity = False``.
+              starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity``
+              and ``use_maximum_negative_similarity = False``.
               See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
 
+    Default values:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "EmbeddingIntentClassifier"
+            # nn architecture
+            # sizes of hidden layers before the embedding layer
+            # for input words and intent labels,
+            # the number of hidden layers is thus equal to the length of this list
+            "hidden_layers_sizes": {"text": [256, 128], "label": []}
+            # Whether to share the hidden layer weights between input words and labels
+            "share_hidden_layers": False
+            # training parameters
+            # initial and final batch sizes - batch size will be
+            # linearly increased for each epoch
+            "batch_size": [64, 256]
+            # how to create batches
+            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
+            # number of epochs
+            "epochs": 300
+            # set random seed to any int to get reproducible results
+            "random_seed": None
+            # optimizer
+            "learning_rate": 0.001
+            # embedding parameters
+            # default dense dimension used if no dense features are present
+            "dense_dimension": {"text": 512, "label": 20}
+            # dimension size of embedding vectors
+            "embedding_dimension": 20
+            # the type of the similarity
+            "number_of_negative_examples": 20
+            # flag if minimize only maximum similarity over incorrect actions
+            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
+            # the type of the loss function
+            "loss_type": "softmax"  # string 'softmax' or 'margin'
+            # number of top intents to normalize scores for softmax loss_type
+            # set to 0 to turn off normalization
+            "ranking_length": 10
+            # how similar the algorithm should try
+            # to make embedding vectors for correct labels
+            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
+            # maximum negative similarity for incorrect labels
+            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
+            # flag: if true, only minimize the maximum similarity for incorrect labels
+            "use_maximum_negative_similarity": True
+            # scale loss inverse proportionally to confidence of correct prediction
+            "scale_loss": True
+            # regularization parameters
+            # the scale of regularization
+            "regularization_constant": 0.002
+            # the scale of how critical the algorithm should be of minimizing the
+            # maximum similarity between embeddings of different labels
+            "negative_margin_scale": 0.8
+            # dropout rate for rnn
+            "droprate": 0.2
+            # if true apply dropout to sparse tensors
+            "use_sparse_input_dropout": False
+            # visualization of accuracy
+            # how often to calculate training accuracy
+            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+            # how many examples to use for calculation of training accuracy
+            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+
 .. _keyword_intent_classifier:
 
 KeywordIntentClassifier
@@ -734,7 +792,7 @@ KeywordIntentClassifier
 
 :Short: Simple keyword matching intent classifier, intended for small, short-term projects.
 :Outputs: ``intent``
-:Requires: nothing
+:Requires: Nothing
 
 :Output-Example:
 
@@ -768,8 +826,8 @@ Selectors
 
 .. _response-selector:
 
-Response Selector
-~~~~~~~~~~~~~~~~~~
+ResponseSelector
+~~~~~~~~~~~~~~~~
 
 :Short: Response Selector
 :Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
@@ -805,23 +863,104 @@ Response Selector
 :Configuration:
 
     The algorithm includes all the hyperparameters that ``DIETClassifier`` uses.
-    In addition, the component can also be configured to train a response selector for a particular retrieval intent
+    In addition, the component can also be configured to train a response selector for a particular retrieval intent.
 
-        - ``retrieval_intent``: sets the name of the intent for which this response selector model is trained.
-          Default ``None``
+        - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
 
-    In the config, you can specify these parameters.
-    The default values are defined in ``ResponseSelector.defaults``:
+    Default values:
 
-    .. literalinclude:: ../../rasa/nlu/selectors/response_selector.py
-       :dedent: 4
-       :start-after: # default properties (DOC MARKER - don't remove)
-       :end-before: # end default properties (DOC MARKER - don't remove)
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "ResponseSelector"
+            # nn architecture
+            # sizes of hidden layers before the embedding layer
+            # for input words and intent labels,
+            # the number of hidden layers is thus equal to the length of this list
+            "hidden_layers_sizes": {"text": [], "label": []}
+            # Whether to share the hidden layer weights between input words and labels
+            "share_hidden_layers": False
+            # number of units in transformer
+            "transformer_size": 256
+            # number of transformer layers
+            "number_of_transformer_layers": 2
+            # number of attention heads in transformer
+            "number_of_attention_heads": 4
+            # max sequence length
+            "maximum_sequence_length": 256
+            # use a unidirectional or bidirectional encoder
+            "unidirectional_encoder": False
+            # if true use key relative embeddings in attention
+            "use_key_relative_attention": False
+            # if true use key relative embeddings in attention
+            "use_value_relative_attention": False
+            # max position for relative embeddings
+            "max_relative_position": None
+            # training parameters
+            # initial and final batch sizes - batch size will be
+            # linearly increased for each epoch
+            "batch_size": [64, 256]
+            # how to create batches
+            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
+            # number of epochs
+            "epochs": 300
+            # set random seed to any int to get reproducible results
+            "random_seed": None
+            # optimizer
+            "learning_rate": 0.001
+            # embedding parameters
+            # default dense dimension used if no dense features are present
+            "dense_dimension": {"text": 512, "label": 512}
+            # dimension size of embedding vectors
+            "embedding_dimension": 20
+            # the type of the similarity
+            "number_of_negative_examples": 20
+            # flag if minimize only maximum similarity over incorrect actions
+            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
+            # the type of the loss function
+            "loss_type": "softmax"  # string 'softmax' or 'margin'
+            # number of top intents to normalize scores for softmax loss_type
+            # set to 0 to turn off normalization
+            "ranking_length": 10
+            # how similar the algorithm should try
+            # to make embedding vectors for correct labels
+            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
+            # maximum negative similarity for incorrect labels
+            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
+            # flag: if true, only minimize the maximum similarity for incorrect labels
+            "use_maximum_negative_similarity": True
+            # scale loss inverse proportionally to confidence of correct prediction
+            "scale_loss": True
+            # regularization parameters
+            # the scale of regularization
+            "regularization_constant": 0.002
+            # the scale of how critical the algorithm should be of minimizing the
+            # maximum similarity between embeddings of different labels
+            "negative_margin_scale": 0.8
+            # dropout rate for rnn
+            "droprate": 0.2
+            # dropout rate for attention
+            "droprate_attention": 0
+            # if true apply dropout to sparse tensors
+            "use_sparse_input_dropout": True
+            # visualization of accuracy
+            # how often to calculate training accuracy
+            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+            # how many examples to use for calculation of training accuracy
+            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+            # if true random tokens of the input message will be masked and the model
+            # should predict those tokens
+            "use_masked_language_model": False
+            # selector config
+            # name of the intent for which this response selector is to be trained
+            "retrieval_intent": None
 
 
 Entity Extractors
 -----------------
 
+Entity extractors extract entities, such as person names or locations, from the user input.
+
 MitieEntityExtractor
 ~~~~~~~~~~~~~~~~~~~~
 
@@ -893,19 +1032,18 @@ SpacyEntityExtractor
         pipeline:
         - name: "SpacyEntityExtractor"
           # dimensions to extract
-          dimensions: ["PERSON", "LOC", "ORG", "PRODUCT"]
+          dimensions: None
 
 
 EntitySynonymMapper
 ~~~~~~~~~~~~~~~~~~~
 
 :Short: Maps synonymous entity values to the same value.
-:Outputs: modifies existing entities that previous entity extraction components found
-:Requires: nothing
+:Outputs: Modifies existing entities that previous entity extraction components found.
+:Requires: Nothing
 :Description:
-    If the training data contains defined synonyms (by using the ``value`` attribute on the entity examples).
-    this component will make sure that detected entity values will be mapped to the same value. For example,
-    if your training data contains the following examples:
+    If the training data contains defined synonyms, this component will make sure that detected entity values will
+    be mapped to the same value. For example, if your training data contains the following examples:
 
     .. code-block:: json
 
@@ -946,7 +1084,7 @@ EntitySynonymMapper
 CRFEntityExtractor
 ~~~~~~~~~~~~~~~~~~
 
-:Short: CRF (conditional random field) entity extraction
+:Short: Conditional random field (CRF) entity extraction
 :Outputs: ``entities``
 :Requires: ``tokens`` and ``dense_features`` (optional)
 :Output-Example:
@@ -965,51 +1103,125 @@ CRFEntityExtractor
         }
 
 :Description:
-    This component implements conditional random fields to do named entity recognition.
+    This component implements a conditional random fields (CRF) to do named entity recognition.
     CRFs can be thought of as an undirected Markov chain where the time steps are words
     and the states are entity classes. Features of the words (capitalisation, POS tagging,
     etc.) give probabilities to certain entity classes, as are transitions between
     neighbouring entity tags: the most likely set of tags is then calculated and returned.
-    If POS features are used (pos or pos2), spaCy has to be installed. If you want to use
-    additional features, such as pre-trained word embeddings, from any provided dense
-    featurizer, use ``"text_dense_features"``.
 
     .. warning::
         ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See
         `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
 
 :Configuration:
-   .. code-block:: yaml
+
+    The following hyperparameters can be set:
+
+        - neural network's architecture:
+
+            - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before
+              the embedding layer for user inputs, the number of hidden layers
+              is equal to the length of the list.
+
+        - training:
+
+            - ``batch_size`` sets the number of training examples in one
+              forward/backward pass, the higher the batch size, the more
+              memory space you'll need.
+            - ``epochs`` sets the number of times the algorithm will see
+              training data, where one ``epoch`` equals one forward pass and
+              one backward pass of all the training examples.
+            - ``random_seed`` if set you will get reproducible
+              training results for the same inputs.
+            - ``learning_rate`` sets the initial learning rate of the optimizer.
+
+        - embedding:
+
+            - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse
+              tensors if no dense features are present.
+
+        - regularization:
+
+            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``droprate`` sets the dropout rate, it should be
+              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
+            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
+
+        - model configuration:
+
+            - ``features`` indicates what features to use. ``CRFEntityExtractor`` is using the same featurization
+              as ``LexicalSyntacticFeaturizer``. See :ref:`LexicalSyntacticFeaturizer` for details on what kind
+              of features are available.
+            - ``BILOU_flag`` determines whether to use BILOU tagging or not.
+
+    .. note:: There is an option to use linearly increasing batch size. The idea comes from
+              `<https://arxiv.org/abs/1711.00489>`_.
+              In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
+              If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
+
+
+    Default values:
+
+    .. code-block:: yaml
 
         pipeline:
         - name: "CRFEntityExtractor"
-          # The features are a ``[before, word, after]`` array with
-          # before, word, after holding keys about which
-          # features to use for each word, for example, ``"title"``
-          # in array before will have the feature
-          # "is the preceding word in title case?".
-          # Available features are:
-          # ``low``, ``title``, ``suffix5``, ``suffix3``, ``suffix2``,
-          # ``suffix1``, ``pos``, ``pos2``, ``prefix5``, ``prefix2``,
-          # ``bias``, ``upper``, ``digit``, ``pattern``, and ``text_dense_features``
-          features: [["low", "title"], ["bias", "suffix3"], ["upper", "pos", "pos2"]]
-
-          # The flag determines whether to use BILOU tagging or not. BILOU
-          # tagging is more rigorous however
-          # requires more examples per entity. Rule of thumb: use only
-          # if more than 100 examples per entity.
-          BILOU_flag: true
-
-          # This is the value given to sklearn_crfcuite.CRF tagger before training.
-          max_iterations: 50
-
-          # This is the value given to sklearn_crfcuite.CRF tagger before training.
-          # Specifies the L1 regularization coefficient.
-          L1_c: 0.1
-
-          # This is the value given to sklearn_crfcuite.CRF tagger before training.
-          # Specifies the L2 regularization coefficient.
-          L2_c: 0.1
+            # 'features' is [before, word, after] array with before, word,
+            # after holding keys about which features to use for each word,
+            # for example, 'title' in array before will have the feature
+            # "is the preceding word in title case?"
+            # POS features require 'SpacyTokenizer'.
+            "features": [
+                ["low", "title", "upper"],
+                [
+                    "BOS",
+                    "EOS",
+                    "low",
+                    "prefix5",
+                    "prefix2",
+                    "suffix5",
+                    "suffix3",
+                    "suffix2",
+                    "upper",
+                    "title",
+                    "digit",
+                ],
+                ["low", "title", "upper"],
+            ]
+            # nn architecture
+            # sizes of hidden layers before the embedding layer
+            # for input words and intent labels,
+            # the number of hidden layers is thus equal to the length of this list
+            "hidden_layers_sizes": {"text": [256, 128]}
+            # training parameters
+            # initial and final batch sizes - batch size will be
+            # linearly increased for each epoch
+            "batch_size": [64, 256]
+            # number of epochs
+            "epochs": 300
+            # set random seed to any int to get reproducible results
+            "random_seed": None
+            # optimizer
+            "learning_rate": 0.001
+            # embedding parameters
+            # default dense dimension used if no dense features are present
+            "dense_dimension": {"text": 512}
+            # regularization parameters
+            # the scale of regularization
+            "regularization_constant": 0.002
+            # dropout rate for rnn
+            "droprate": 0.2
+            # if true apply dropout to sparse tensors
+            "use_sparse_input_dropout": True
+            # visualization of accuracy
+            # how often to calculate training accuracy
+            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+            # how many examples to use for calculation of training accuracy
+            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+            # BILOU_flag determines whether to use BILOU tagging or not.
+            # More rigorous however requires more examples per entity
+            # rule of thumb: use only if more than 100 egs. per entity
+            "BILOU_flag": False
 
 .. _DucklingHTTPExtractor:
 
@@ -1018,19 +1230,21 @@ DucklingHTTPExtractor
 
 :Short: Duckling lets you extract common entities like dates,
         amounts of money, distances, and others in a number of languages.
-:Outputs: appends ``entities``
-:Requires: nothing
+:Outputs: ``entities``
+:Requires: Nothing
 :Output-Example:
 
     .. code-block:: json
 
         {
-            "entities": [{"end": 53,
-                          "entity": "time",
-                          "start": 48,
-                          "value": "2017-04-10T00:00:00.000+02:00",
-                          "confidence": 1.0,
-                          "extractor": "DucklingHTTPExtractor"}]
+            "entities": [{
+                "end": 53,
+                "entity": "time",
+                "start": 48,
+                "value": "2017-04-10T00:00:00.000+02:00",
+                "confidence": 1.0,
+                "extractor": "DucklingHTTPExtractor"
+            }]
         }
 
 :Description:
@@ -1076,8 +1290,10 @@ DucklingHTTPExtractor
           timeout : 3
 
 
-Combined Entity Extraction and Intent Classification
-----------------------------------------------------
+Combined Entity Extractors and Intent Classifiers
+-------------------------------------------------
+
+.. _diet-classifier:
 
 DIETClassifier
 ~~~~~~~~~~~~~~
@@ -1114,90 +1330,198 @@ DIETClassifier
 :Description:
     TODO
 
-    .. note:: If during prediction time a message contains **only** words unseen during training,
+    .. note:: If during prediction time a message contains **only** words unseen during training
               and no Out-Of-Vacabulary preprocessor was used,
-              empty intent ``None`` is predicted with confidence ``0.0``.
+              an empty intent ``None`` is predicted with confidence ``0.0``.
 
 :Configuration:
 
-    The algorithm also has hyperparameters to control:
+    The following hyperparameters can be set:
 
         - neural network's architecture:
 
-            - ``hidden_layers_sizes_a`` sets a list of hidden layer sizes before
+            - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before
               the embedding layer for user inputs, the number of hidden layers
-              is equal to the length of the list
-            - ``hidden_layers_sizes_b`` sets a list of hidden layer sizes before
+              is equal to the length of the list.
+            - ``hidden_layers_sizes.label`` sets a list of hidden layer sizes before
               the embedding layer for intent labels, the number of hidden layers
-              is equal to the length of the list
-            - ``share_hidden`` if set to True, shares the hidden layers between user inputs and intent label
+              is equal to the length of the list.
+            - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent label.
+            - ``transformer_size`` sets the size of the transformer.
+            - ``number_of_transformer_layers`` sets the number of transformer layers to use.
+            - ``number_of_attention_heads`` sets the number of attention heads to use.
+            - ``maximum_sequence_length`` sets the maximum length of sequence.
+            - ``unidirectional_encoder`` specifies whether to use a unidirectional or bidirectional encoder.
+            - ``use_key_relative_attention`` if true use key relative embeddings in attention.
+            - ``use_value_relative_attention`` if true use key relative embeddings in attention.
+            - ``max_relative_position`` sets the max position for relative embeddings.
 
         - training:
 
             - ``batch_size`` sets the number of training examples in one
               forward/backward pass, the higher the batch size, the more
-              memory space you'll need;
+              memory space you'll need.
             - ``batch_strategy`` sets the type of batching strategy,
-              it should be either ``sequence`` or ``balanced``;
+              it should be either ``sequence`` or ``balanced``.
             - ``epochs`` sets the number of times the algorithm will see
               training data, where one ``epoch`` equals one forward pass and
-              one backward pass of all the training examples;
-            - ``random_seed`` if set to any int will get reproducible
-              training results for the same inputs;
+              one backward pass of all the training examples.
+            - ``random_seed`` if set you will get reproducible
+              training results for the same inputs.
+            - ``learning_rate`` sets the initial learning rate of the optimizer.
 
         - embedding:
 
-            - ``embed_dim`` sets the dimension of embedding space;
-            - ``num_neg`` sets the number of incorrect intent labels,
-              the algorithm will minimize their similarity to the user
-              input during training;
+            - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse
+              tensors if no dense features are present.
+            - ``dense_dimension.label`` sets the dense dimensions for intent labels to use for sparse
+              tensors if no dense features are present.
+            - ``embedding_dimension`` sets the dimension of embedding space.
+            - ``number_of_negative_examples`` sets the number of incorrect intent labels.
+              The algorithm will minimize their similarity to the user
+              input during training.
             - ``similarity_type`` sets the type of the similarity,
               it should be either ``auto``, ``cosine`` or ``inner``,
               if ``auto``, it will be set depending on ``loss_type``,
-              ``inner`` for ``softmax``, ``cosine`` for ``margin``;
+              ``inner`` for ``softmax``, ``cosine`` for ``margin``.
             - ``loss_type`` sets the type of the loss function,
-              it should be either ``softmax`` or ``margin``;
+              it should be either ``softmax`` or ``margin``.
             - ``ranking_length`` defines the number of top confidences over
-              which to normalize ranking results if ``loss_type: "softmax"``;
-              to turn off normalization set it to 0
-            - ``mu_pos`` controls how similar the algorithm should try
+              which to normalize ranking results if ``loss_type: "softmax"``.
+              To turn off normalization set it to 0.
+            - ``maximum_positive_similarity`` controls how similar the algorithm should try
               to make embedding vectors for correct intent labels,
-              used only if ``loss_type`` is set to ``margin``;
-            - ``mu_neg`` controls maximum negative similarity for
-              incorrect intents,
-              used only if ``loss_type`` is set to ``margin``;
-            - ``use_max_sim_neg`` if ``true`` the algorithm only
+              used only if ``loss_type`` is set to ``margin``.
+            - ``maximum_negative_similarity`` controls maximum negative similarity for
+              incorrect intents, used only if ``loss_type`` is set to ``margin``.
+            - ``use_maximum_negative_similarity`` if ``true`` the algorithm only
               minimizes maximum similarity over incorrect intent labels,
-              used only if ``loss_type`` is set to ``margin``;
+              used only if ``loss_type`` is set to ``margin``.
             - ``scale_loss`` if ``true`` the algorithm will downscale the loss
               for examples where correct label is predicted with high confidence,
-              used only if ``loss_type`` is set to ``softmax``;
+              used only if ``loss_type`` is set to ``softmax``.
 
         - regularization:
 
-            - ``C2`` sets the scale of L2 regularization
-            - ``C_emb`` sets the scale of how important is to minimize
-              the maximum similarity between embeddings of different intent labels;
+            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``negative_margin_scale`` sets the scale of how important is to minimize
+              the maximum similarity between embeddings of different intent labels.
             - ``droprate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1``
-              would drop out ``10%`` of input units;
+              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
+            - ``droprate_attention`` sets the dropout rate for attention, it should be
+              between ``0`` and ``1``, e.g. ``droprate_attention=0.1`` would drop out ``10%`` of input units.
+            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
+
+        - model configuration:
 
-    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
+            - ``use_masked_language_model`` specifies whether to apply masking or not.
+            - ``intent_classification`` indicates whether intent classification should be performed or not.
+            - ``entity_recognition`` indicates whether entity recognition should be performed or not.
+            - ``BILOU_flag`` determines whether to use BILOU tagging or not.
+
+    .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
+              be between ``-1`` and ``1``.
 
     .. note:: There is an option to use linearly increasing batch size. The idea comes from
               `<https://arxiv.org/abs/1711.00489>`_.
               In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
               If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
 
-    In the config, you can specify these parameters.
-    The default values are defined in ``DIETClassifier.defaults``:
+    .. note:: Parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original
+              starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity``
+              and ``use_maximum_negative_similarity = False``.
+              See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
 
-    .. literalinclude:: ../../rasa/nlu/classifiers/diet_classifier.py
-       :dedent: 4
-       :start-after: # default properties (DOC MARKER - don't remove)
-       :end-before: # end default properties (DOC MARKER - don't remove)
+    Default values:
 
-    .. note:: Parameter ``mu_neg`` is set to a negative value to mimic the original
-              starspace algorithm in the case ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``.
-              See `starspace paper <https://arxiv.org/abs/1709.03856>`_ for details.
+    .. code-block:: yaml
 
+        pipeline:
+        - name: "DIETClassifier"
+            # nn architecture
+            # sizes of hidden layers before the embedding layer
+            # for input words and intent labels,
+            # the number of hidden layers is thus equal to the length of this list
+            "hidden_layers_sizes": {"text": [], "label": []}
+            # Whether to share the hidden layer weights between input words and labels
+            "share_hidden_layers": False
+            # number of units in transformer
+            "transformer_size": 256
+            # number of transformer layers
+            "number_of_transformer_layers": 2
+            # number of attention heads in transformer
+            "number_of_attention_heads": 4
+            # max sequence length
+            "maximum_sequence_length": 256
+            # use a unidirectional or bidirectional encoder
+            "unidirectional_encoder": False
+            # if true use key relative embeddings in attention
+            "use_key_relative_attention": False
+            # if true use key relative embeddings in attention
+            "use_value_relative_attention": False
+            # max position for relative embeddings
+            "max_relative_position": None
+            # training parameters
+            # initial and final batch sizes - batch size will be
+            # linearly increased for each epoch
+            "batch_size": [64, 256]
+            # how to create batches
+            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
+            # number of epochs
+            "epochs": 300
+            # set random seed to any int to get reproducible results
+            "random_seed": None
+            # optimizer
+            "learning_rate": 0.001
+            # embedding parameters
+            # default dense dimension used if no dense features are present
+            "dense_dimension": {"text": 512, "label": 20}
+            # dimension size of embedding vectors
+            "embedding_dimension": 20
+            # the type of the similarity
+            "number_of_negative_examples": 20
+            # flag if minimize only maximum similarity over incorrect actions
+            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
+            # the type of the loss function
+            "loss_type": "softmax"  # string 'softmax' or 'margin'
+            # number of top intents to normalize scores for softmax loss_type
+            # set to 0 to turn off normalization
+            "ranking_length": 10
+            # how similar the algorithm should try
+            # to make embedding vectors for correct labels
+            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
+            # maximum negative similarity for incorrect labels
+            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
+            # flag: if true, only minimize the maximum similarity for incorrect labels
+            "use_maximum_negative_similarity": True
+            # scale loss inverse proportionally to confidence of correct prediction
+            "scale_loss": True
+            # regularization parameters
+            # the scale of regularization
+            "regularization_constant": 0.002
+            # the scale of how critical the algorithm should be of minimizing the
+            # maximum similarity between embeddings of different labels
+            "negative_margin_scale": 0.8
+            # dropout rate for rnn
+            "droprate": 0.2
+            # dropout rate for attention
+            "droprate_attention": 0
+            # if true apply dropout to sparse tensors
+            "use_sparse_input_dropout": True
+            # visualization of accuracy
+            # how often to calculate training accuracy
+            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+            # how many examples to use for calculation of training accuracy
+            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+            # model config
+            # if true intent classification is trained and intent predicted
+            "intent_classification": True
+            # if true named entity recognition is trained and entities predicted
+            "entity_recognition": True
+            # if true random tokens of the input message will be masked and the model
+            # should predict those tokens
+            "use_masked_language_model": False
+            # BILOU_flag determines whether to use BILOU tagging or not.
+            # More rigorous however requires more examples per entity
+            # rule of thumb: use only if more than 100 egs. per entity
+            "BILOU_flag": True
\ No newline at end of file
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index e263553a1cc3..6b983a393585 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -25,9 +25,9 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
@@ -86,12 +86,12 @@ class EmbeddingPolicy(TEDPolicy):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        MU_NEG: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
         # the number of incorrect labels, the algorithm will minimize
         # their similarity to the user input during training
-        USE_MAX_SIM_NEG: True,  # flag which loss function to use
+        USE_MAX_NEG_SIM: True,  # flag which loss function to use
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 59f0951ebf9b..9d65e00901e7 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -45,9 +45,9 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
@@ -79,10 +79,16 @@ class TEDPolicy(Policy):
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # max sequence length if pos_encoding='emb'
+        # max sequence length
         MAX_SEQ_LENGTH: 256,
         # number of attention heads in transformer
         NUM_HEADS: 4,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
         # training parameters
         # initial and final batch sizes:
         # batch size will be linearly increased for each epoch
@@ -107,12 +113,12 @@ class TEDPolicy(Policy):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        MU_NEG: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
         # the number of incorrect labels, the algorithm will minimize
         # their similarity to the user input during training
-        USE_MAX_SIM_NEG: True,  # flag which loss function to use
+        USE_MAX_NEG_SIM: True,  # flag which loss function to use
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization
@@ -132,12 +138,6 @@ class TEDPolicy(Policy):
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for hold out validation set
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -502,9 +502,9 @@ def _prepare_layers(self) -> None:
         self._tf_layers["loss.label"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
-            self.config[MU_POS],
-            self.config[MU_NEG],
-            self.config[USE_MAX_SIM_NEG],
+            self.config[MAX_POS_SIM],
+            self.config[MAX_NEG_SIM],
+            self.config[USE_MAX_NEG_SIM],
             self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 88762df951ee..f0af76a3c044 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -64,9 +64,9 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     BILOU_FLAG,
     KEY_RELATIVE_ATTENTION,
@@ -116,8 +116,16 @@ class DIETClassifier(EntityExtractor):
         NUM_TRANSFORMER_LAYERS: 2,
         # number of attention heads in transformer
         NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
+        # max sequence length
         MAX_SEQ_LENGTH: 256,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -146,11 +154,11 @@ class DIETClassifier(EntityExtractor):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for incorrect labels
-        USE_MAX_SIM_NEG: True,
+        USE_MAX_NEG_SIM: True,
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization parameters
@@ -163,8 +171,6 @@ class DIETClassifier(EntityExtractor):
         DROPRATE: 0.2,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -180,12 +186,6 @@ class DIETClassifier(EntityExtractor):
         # if true random tokens of the input message will be masked and the model
         # should predict those tokens
         MASKED_LM: False,
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
@@ -1081,9 +1081,9 @@ def _prepare_mask_lm_layers(self, name: Text) -> None:
         self._tf_layers[f"loss.{name}_mask"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
-            self.config[MU_POS],
-            self.config[MU_NEG],
-            self.config[USE_MAX_SIM_NEG],
+            self.config[MAX_POS_SIM],
+            self.config[MAX_NEG_SIM],
+            self.config[USE_MAX_NEG_SIM],
             self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
@@ -1106,9 +1106,9 @@ def _prepare_label_classification_layers(self) -> None:
         self._tf_layers["loss.label"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
-            self.config[MU_POS],
-            self.config[MU_NEG],
-            self.config[USE_MAX_SIM_NEG],
+            self.config[MAX_POS_SIM],
+            self.config[MAX_NEG_SIM],
+            self.config[USE_MAX_NEG_SIM],
             self.config[NEG_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 1828a65f5cc8..eb84a77713e5 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -30,9 +30,9 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     BILOU_FLAG,
 )
@@ -48,7 +48,7 @@ class EmbeddingIntentClassifier(DIETClassifier):
 
     requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
-    # default properties (DOC MARKER - don't remove)
+    # please make sure to update the docs when changing a default parameter
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -85,11 +85,11 @@ class EmbeddingIntentClassifier(DIETClassifier):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect labels
-        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for incorrect labels
-        USE_MAX_SIM_NEG: True,
+        USE_MAX_NEG_SIM: True,
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization parameters
@@ -108,7 +108,6 @@ class EmbeddingIntentClassifier(DIETClassifier):
         # how many examples to use for calculation of training accuracy
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 49c4df99d067..7e309268a33d 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -16,7 +16,6 @@
     HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     BATCH_SIZES,
-    BATCH_STRATEGY,
     EPOCHS,
     RANDOM_SEED,
     LEARNING_RATE,
@@ -30,8 +29,6 @@
     DROPRATE,
     REGULARIZATION_CONSTANT,
     BILOU_FLAG,
-    SHARE_HIDDEN_LAYERS,
-    UNIDIRECTIONAL_ENCODER,
 )
 from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.models import RasaModel
@@ -45,7 +42,7 @@ class CRFEntityExtractor(DIETClassifier):
 
     requires = [TOKENS_NAMES[TEXT]]
 
-    # default properties (DOC MARKER - don't remove)
+    # please make sure to update the docs when changing a default parameter
     defaults = {
         # 'features' is [before, word, after] array with before, word,
         # after holding keys about which features to use for each word,
@@ -77,8 +74,6 @@ class CRFEntityExtractor(DIETClassifier):
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
         BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
         # number of epochs
         EPOCHS: 300,
         # set random seed to any int to get reproducible results
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 25123eb3110b..ce1f4f3e0b59 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -38,9 +38,9 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     BILOU_FLAG,
     KEY_RELATIVE_ATTENTION,
@@ -88,7 +88,7 @@ class ResponseSelector(DIETClassifier):
         any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
-    # default properties (DOC MARKER - don't remove)
+    # please make sure to update the docs when changing a default parameter
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -103,8 +103,16 @@ class ResponseSelector(DIETClassifier):
         NUM_TRANSFORMER_LAYERS: 2,
         # number of attention heads in transformer
         NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
+        # max sequence length
         MAX_SEQ_LENGTH: 256,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -133,12 +141,12 @@ class ResponseSelector(DIETClassifier):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct intent labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect intent labels
-        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for
         # incorrect intent labels
-        USE_MAX_SIM_NEG: True,
+        USE_MAX_NEG_SIM: True,
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization parameters
@@ -151,8 +159,6 @@ class ResponseSelector(DIETClassifier):
         DROPRATE: 0.2,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -163,17 +169,10 @@ class ResponseSelector(DIETClassifier):
         # if true random tokens of the input message will be masked and the model
         # should predict those tokens
         MASKED_LM: False,
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 2f27695e762b..bbeef0c6539a 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -8,6 +8,9 @@
 NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers"
 NUM_HEADS = "number_of_attention_heads"
 UNIDIRECTIONAL_ENCODER = "unidirectional_encoder"
+KEY_RELATIVE_ATTENTION = "use_key_relative_attention"
+VALUE_RELATIVE_ATTENTION = "use_value_relative_attention"
+MAX_RELATIVE_POSITION = "max_relative_position"
 
 MAX_SEQ_LENGTH = "maximum_sequence_length"
 
@@ -23,13 +26,13 @@
 SIMILARITY_TYPE = "similarity_type"
 LOSS_TYPE = "loss_type"
 NUM_NEG = "number_of_negative_examples"
-MU_POS = "maximum_positive_similarity"
-MU_NEG = "maximum_negative_similarity"
-USE_MAX_SIM_NEG = "use_maximum_negative_similarity"
+MAX_POS_SIM = "maximum_positive_similarity"
+MAX_NEG_SIM = "maximum_negative_similarity"
+USE_MAX_NEG_SIM = "use_maximum_negative_similarity"
 
 SCALE_LOSS = "scale_loss"
 REGULARIZATION_CONSTANT = "regularization_constant"
-NEG_MARGIN_SCALE = "neg_margin_scale"
+NEG_MARGIN_SCALE = "negative_margin_scale"
 DROPRATE = "droprate"
 DROPRATE_ATTENTION = "droprate_attention"
 DROPRATE_DIALOGUE = "droprate_dialogue"
@@ -47,7 +50,3 @@
 RANKING_LENGTH = "ranking_length"
 
 BILOU_FLAG = "BILOU_flag"
-
-KEY_RELATIVE_ATTENTION = "use_key_relative_attention"
-VALUE_RELATIVE_ATTENTION = "use_value_relative_attention"
-MAX_RELATIVE_POSITION = "max_relative_position"
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 41d186c45365..639ab65cfc7e 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -18,12 +18,13 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     REGULARIZATION_CONSTANT,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
     EMBED_DIM,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
+    NEG_MARGIN_SCALE,
 )
 
 
@@ -112,10 +113,11 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
     config = _replace_deprecated_option("dense_dim", DENSE_DIM, config)
     config = _replace_deprecated_option("embed_dim", EMBED_DIM, config)
     config = _replace_deprecated_option("num_neg", NUM_NEG, config)
-    config = _replace_deprecated_option("mu_pos", MU_POS, config)
-    config = _replace_deprecated_option("mu_neg", MU_NEG, config)
-    config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_SIM_NEG, config)
+    config = _replace_deprecated_option("mu_pos", MAX_POS_SIM, config)
+    config = _replace_deprecated_option("mu_neg", MAX_NEG_SIM, config)
+    config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_NEG_SIM, config)
     config = _replace_deprecated_option("C2", REGULARIZATION_CONSTANT, config)
+    config = _replace_deprecated_option("C_emb", NEG_MARGIN_SCALE, config)
     config = _replace_deprecated_option(
         "evaluate_every_num_epochs", EVAL_NUM_EPOCHS, config
     )

From c0cb91ae81ad2fd2cd362a9fb24ec3095e71c098 Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Thu, 13 Feb 2020 14:14:47 +0100
Subject: [PATCH 381/633] changes to avoid OSError in convert tokenizer and
 featurizer; added tqdm for ConveRT featurizer

---
 .../dense_featurizer/convert_featurizer.py            | 11 ++++++++---
 rasa/nlu/tokenizers/convert_tokenizer.py              |  7 ++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 95266212e305..5c0d65b90049 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -1,5 +1,6 @@
 import logging
 from typing import Any, Dict, List, Optional, Text, Tuple
+from tqdm import tqdm
 
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -33,10 +34,14 @@ def _load_model(self) -> None:
         # needed in order to load model
         import tensorflow_text
         import tensorflow_hub as tfhub
+        import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-
-        self.module = tfhub.load(model_url)
+        try:
+            self.module = tfhub.load(model_url)
+        except OSError:
+            os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'
+            self.module = tfhub.load(model_url)
 
         self.sentence_encoding_signature = self.module.signatures["default"]
         self.sequence_encoding_signature = self.module.signatures["encode_sequence"]
@@ -180,7 +185,7 @@ def train(
 
         batch_size = 64
 
-        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+        for attribute in tqdm(DENSE_FEATURIZABLE_ATTRIBUTES):
 
             non_empty_examples = list(
                 filter(lambda x: x.get(attribute), training_data.training_examples)
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 291d7d697da6..dcef79f940c9 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -33,10 +33,15 @@ def _load_tokenizer_params(self):
         # needed to load the ConveRT model
         import tensorflow_text
         import tensorflow_hub as tfhub
+        import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
 
-        self.module = tfhub.load(model_url)
+        try:
+            self.module = tfhub.load(model_url)
+        except OSError:
+            os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'
+            self.module = tfhub.load(model_url)
 
         self.tokenize_signature = self.module.signatures["tokenize"]
 

From 86237a53900c8bdef9a7826ab1692c2c155eed9a Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Thu, 13 Feb 2020 14:21:24 +0100
Subject: [PATCH 382/633] black reformatted

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +-
 rasa/nlu/tokenizers/convert_tokenizer.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 5c0d65b90049..4043736d5a9f 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -40,7 +40,7 @@ def _load_model(self) -> None:
         try:
             self.module = tfhub.load(model_url)
         except OSError:
-            os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'
+            os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
             self.module = tfhub.load(model_url)
 
         self.sentence_encoding_signature = self.module.signatures["default"]
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index dcef79f940c9..03eaf03b09a6 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -40,7 +40,7 @@ def _load_tokenizer_params(self):
         try:
             self.module = tfhub.load(model_url)
         except OSError:
-            os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'
+            os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
             self.module = tfhub.load(model_url)
 
         self.tokenize_signature = self.module.signatures["tokenize"]

From ddae7aee2f98ff1d1453fa04b12637eaafcf9307 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 15:26:49 +0100
Subject: [PATCH 383/633] refactored tests

---
 rasa/utils/tensorflow/environment.py | 50 +++++++++++++++++-----------
 tests/utils/test_tf_environment.py   | 20 +++++++++--
 2 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index e015a5e3aec3..e39a290dd0b5 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -12,17 +12,14 @@
 logger = logging.getLogger(__name__)
 
 
-def setup_gpu_environment(gpu_memory_config: Text) -> None:
+def setup_gpu_environment() -> None:
+
+    gpu_memory_config = os.getenv(ENV_GPU_CONFIG, None)
 
     if gpu_memory_config:
 
         # Parse GPU config
-        # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
-        # Parse it and store in a dictionary
-        parsed_gpu_config = {
-            instance.split(":")[0].strip(): int(instance.split(":")[1].strip())
-            for instance in gpu_memory_config.split(",")
-        }
+        parsed_gpu_config = parse_gpu_config(gpu_memory_config)
 
         physical_gpus = tf.config.list_physical_devices("GPU")
 
@@ -32,7 +29,7 @@ def setup_gpu_environment(gpu_memory_config: Text) -> None:
             for gpu_id, gpu_id_memory in parsed_gpu_config.items():
                 try:
                     tf.config.experimental.set_virtual_device_configuration(
-                        physical_gpus[int(gpu_id)],
+                        physical_gpus[gpu_id],
                         [
                             tf.config.experimental.VirtualDeviceConfiguration(
                                 memory_limit=gpu_id_memory
@@ -49,13 +46,33 @@ def setup_gpu_environment(gpu_memory_config: Text) -> None:
 
         else:
             logger.info(
-                "You have an environment variable GPU_MEMORY_ALLOC set but no GPUs were detected to configure"
+                f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure"
             )
 
 
-def setup_cpu_environment(
-    inter_op_parallel_threads: Text, intra_op_parallel_threads: Text
-) -> None:
+def parse_gpu_config(gpu_memory_config: Text):
+
+    # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
+    # Parse it and store in a dictionary
+    parsed_gpu_config = {}
+
+    try:
+        for instance in gpu_memory_config.split(","):
+            instance_gpu_id, instance_gpu_mem = instance.split(":")
+            instance_gpu_id = int(instance_gpu_id)
+            instance_gpu_mem = int(instance_gpu_mem)
+
+            parsed_gpu_config[instance_gpu_id] = instance_gpu_mem
+    except ValueError as e:
+        raise e
+
+    return parsed_gpu_config
+
+
+def setup_cpu_environment() -> None:
+
+    inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
+    intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
 
     if inter_op_parallel_threads:
         tf.config.threading.set_inter_op_parallelism_threads(
@@ -70,10 +87,5 @@ def setup_cpu_environment(
 
 def setup_tf_environment():
 
-    # Get all env variables
-    gpu_memory_config = os.getenv(ENV_GPU_CONFIG, None)
-    inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
-    intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
-
-    setup_gpu_environment(gpu_memory_config)
-    setup_cpu_environment(inter_op_parallel_threads, intra_op_parallel_threads)
+    setup_cpu_environment()
+    setup_gpu_environment()
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index 14c0d2ba2e98..ae2525f7b4bd 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -1,13 +1,27 @@
+import pytest
 import tensorflow as tf
 from rasa.utils.tensorflow.environment import setup_cpu_environment
+from rasa.utils.tensorflow.environment import parse_gpu_config
+from _pytest.monkeypatch import MonkeyPatch
+from rasa.constants import ENV_CPU_INTER_OP_CONFIG, ENV_CPU_INTRA_OP_CONFIG
 
 
 def test_tf_cpu_environment_setting():
 
-    inter_op_threads = "2"
-    intra_op_threads = "3"
+    monkeypatch = MonkeyPatch()
+    monkeypatch.setenv(ENV_CPU_INTRA_OP_CONFIG, "3")
+    monkeypatch.setenv(ENV_CPU_INTER_OP_CONFIG, "2")
 
-    setup_cpu_environment(inter_op_threads, intra_op_threads)
+    setup_cpu_environment()
 
     assert tf.config.threading.get_inter_op_parallelism_threads() == 2
     assert tf.config.threading.get_intra_op_parallelism_threads() == 3
+
+
+@pytest.mark.parametrize(
+    "gpu_config_string, parsed_gpu_config",
+    [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],
+)
+def test_parsed_gpu_config(gpu_config_string, parsed_gpu_config):
+
+    assert parse_gpu_config(gpu_config_string) == parsed_gpu_config

From 4694f603cd85238d580c233fdb7e5c53cba92699 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 13 Feb 2020 15:54:29 +0100
Subject: [PATCH 384/633] fix static seq len in transformer

---
 rasa/utils/tensorflow/transformer.py | 52 ++++++++++++++--------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 6776023eec03..8dfde8a63771 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -422,8 +422,10 @@ def __init__(
         self._embedding = DenseWithSparseWeights(
             units=units, kernel_regularizer=l2_regularizer
         )
-
-        self._pos_encoding = self._positional_encoding(max_seq_length, self.units)
+        # positional encoding helpers
+        self._angles = self._get_angles()
+        self._even_indices = np.arange(0, self.units, 2, dtype=np.int32)[:, np.newaxis]
+        self._odd_indices = np.arange(1, self.units, 2, dtype=np.int32)[:, np.newaxis]
 
         self._dropout = tf.keras.layers.Dropout(dropout_rate)
 
@@ -444,33 +446,32 @@ def __init__(
         ]
         self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
-    @staticmethod
-    def _look_ahead_pad_mask(seq_len: int) -> tf.Tensor:
-        pad_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
-        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
+    def _get_angles(self) -> np.ndarray:
+        i = np.arange(self.units)[np.newaxis, :]
+        return 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.units))
 
-    @staticmethod
-    def _get_angles(pos: np.ndarray, i: np.ndarray, units: int) -> np.ndarray:
-        angle_dropout_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(units))
-        return pos * angle_dropout_rates
-
-    @classmethod
-    def _positional_encoding(cls, max_position: int, units: int) -> tf.Tensor:
-        angle_rads = cls._get_angles(
-            np.arange(max_position)[:, np.newaxis],
-            np.arange(units)[np.newaxis, :],
-            units,
-        )
+    def _positional_encoding(self, max_position: tf.Tensor) -> tf.Tensor:
+        max_position = tf.cast(max_position, dtype=tf.float32)
+        angle_rads = tf.range(max_position)[:, tf.newaxis] * self._angles
 
+        # transpose for easy slicing
+        angle_rads = tf.transpose(angle_rads, [1, 0])
+        shape = tf.shape(angle_rads)
         # apply sin to even indices in the array; 2i
-        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
-
+        sin_even = tf.sin(tf.gather_nd(angle_rads, self._even_indices))
+        pos_encoding_even = tf.scatter_nd(self._even_indices, sin_even, shape)
         # apply cos to odd indices in the array; 2i+1
-        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
-
-        pos_encoding = angle_rads[np.newaxis, ...]
+        cos_odd = tf.cos(tf.gather_nd(angle_rads, self._odd_indices))
+        pos_encoding_odd = tf.scatter_nd(self._odd_indices, cos_odd, shape)
+        # combine even and odd positions and transpose back
+        pos_encoding = tf.transpose(pos_encoding_even + pos_encoding_odd, [1, 0])
+        # add batch dimension
+        return tf.stop_gradient(pos_encoding[tf.newaxis, ...])
 
-        return tf.cast(pos_encoding, dtype=tf.float32)
+    @staticmethod
+    def _look_ahead_pad_mask(max_position: tf.Tensor) -> tf.Tensor:
+        pad_mask = 1 - tf.linalg.band_part(tf.ones((max_position, max_position)), -1, 0)
+        return pad_mask[tf.newaxis, tf.newaxis, :, :]  # (1, 1, seq_len, seq_len)
 
     def call(
         self,
@@ -484,8 +485,7 @@ def call(
         # adding embedding and position encoding.
         x = self._embedding(x)  # (batch_size, seq_len, units)
         x *= tf.math.sqrt(tf.cast(self.units, tf.float32))
-        if pad_mask is not None:
-            x += self._pos_encoding[:, : tf.shape(x)[1], :] * (1 - pad_mask)
+        x += self._positional_encoding(tf.shape(x)[1])
         x = self._dropout(x, training=training)
 
         if pad_mask is not None:

From c87593303e5778cfb1bea358e5e6c3e1ef167948 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 16:09:48 +0100
Subject: [PATCH 385/633] WIP

---
 rasa/utils/tensorflow/environment.py | 17 +++++++++++++----
 tests/utils/test_tf_environment.py   | 16 +++++++++++-----
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index e39a290dd0b5..6ee35dea6bde 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Text
+from typing import Text, Tuple
 
 import tensorflow as tf
 from rasa.constants import (
@@ -63,13 +63,16 @@ def parse_gpu_config(gpu_memory_config: Text):
             instance_gpu_mem = int(instance_gpu_mem)
 
             parsed_gpu_config[instance_gpu_id] = instance_gpu_mem
-    except ValueError as e:
-        raise e
+    except ValueError:
+        # Add a helper explanation
+        raise ValueError(
+            f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}'"
+        )
 
     return parsed_gpu_config
 
 
-def setup_cpu_environment() -> None:
+def setup_cpu_environment() -> Tuple[int, int]:
 
     inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
     intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
@@ -84,6 +87,12 @@ def setup_cpu_environment() -> None:
             int(intra_op_parallel_threads.strip())
         )
 
+    # Returning the actual values as a confirmation. Helps with tests too.
+    return (
+        tf.config.threading.get_inter_op_parallelism_threads(),
+        tf.config.threading.get_intra_op_parallelism_threads(),
+    )
+
 
 def setup_tf_environment():
 
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index ae2525f7b4bd..d23fee5abc32 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -4,16 +4,22 @@
 from rasa.utils.tensorflow.environment import parse_gpu_config
 from _pytest.monkeypatch import MonkeyPatch
 from rasa.constants import ENV_CPU_INTER_OP_CONFIG, ENV_CPU_INTRA_OP_CONFIG
+from typing import Text
+import multiprocessing
 
-
-def test_tf_cpu_environment_setting():
+# @pytest.fixture()
+def tf_cpu_environment_setter(inter_op_config: Text, intra_op_config: Text):
 
     monkeypatch = MonkeyPatch()
-    monkeypatch.setenv(ENV_CPU_INTRA_OP_CONFIG, "3")
-    monkeypatch.setenv(ENV_CPU_INTER_OP_CONFIG, "2")
+    monkeypatch.setenv(ENV_CPU_INTRA_OP_CONFIG, intra_op_config)
+    monkeypatch.setenv(ENV_CPU_INTER_OP_CONFIG, inter_op_config)
+
+    return setup_cpu_environment()
 
-    setup_cpu_environment()
 
+def test_tf_cpu_environment_setting():
+
+    child_process = multiprocessing.Process(target=tf_cpu_environment_setter, args=())
     assert tf.config.threading.get_inter_op_parallelism_threads() == 2
     assert tf.config.threading.get_intra_op_parallelism_threads() == 3
 

From 02950235f0540eb5b61768804c1729d2361e3a07 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 16:16:38 +0100
Subject: [PATCH 386/633] check

---
 rasa/utils/tensorflow/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 77e67fddb739..d4b624e1e00f 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -218,7 +218,7 @@ def _get_tf_functions(
 
         logger.debug(f"Building tensorflow {phase} graph...")
         # allows increasing batch size
-        tf_dataset_function = tf.function(func=dataset_function)
+        tf_dataset_function = dataset_function  # tf.function(func=dataset_function)
 
         init_dataset = tf_dataset_function(tf.ones((), tf.int32))
 

From 2843fe0388ad11e660a95f57092fa4023ba870e3 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 16:40:59 +0100
Subject: [PATCH 387/633] fix test for cpu testing

---
 rasa/utils/tensorflow/models.py    |  2 +-
 tests/utils/test_tf_environment.py | 36 ++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index d4b624e1e00f..77e67fddb739 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -218,7 +218,7 @@ def _get_tf_functions(
 
         logger.debug(f"Building tensorflow {phase} graph...")
         # allows increasing batch size
-        tf_dataset_function = dataset_function  # tf.function(func=dataset_function)
+        tf_dataset_function = tf.function(func=dataset_function)
 
         init_dataset = tf_dataset_function(tf.ones((), tf.int32))
 
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index d23fee5abc32..e93e5a979476 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -1,33 +1,45 @@
 import pytest
-import tensorflow as tf
+from _pytest.monkeypatch import MonkeyPatch
+from typing import Text, Dict
+import multiprocessing
 from rasa.utils.tensorflow.environment import setup_cpu_environment
 from rasa.utils.tensorflow.environment import parse_gpu_config
-from _pytest.monkeypatch import MonkeyPatch
 from rasa.constants import ENV_CPU_INTER_OP_CONFIG, ENV_CPU_INTRA_OP_CONFIG
-from typing import Text
-import multiprocessing
 
-# @pytest.fixture()
-def tf_cpu_environment_setter(inter_op_config: Text, intra_op_config: Text):
+
+def tf_cpu_setter(
+    inter_op_config: Text, intra_op_config: Text, shared_context_output: Dict[Text, int]
+):
 
     monkeypatch = MonkeyPatch()
     monkeypatch.setenv(ENV_CPU_INTRA_OP_CONFIG, intra_op_config)
     monkeypatch.setenv(ENV_CPU_INTER_OP_CONFIG, inter_op_config)
 
-    return setup_cpu_environment()
+    set_inter_op_val, set_intra_op_val = setup_cpu_environment()
+
+    shared_context_output[ENV_CPU_INTER_OP_CONFIG] = set_inter_op_val
+    shared_context_output[ENV_CPU_INTRA_OP_CONFIG] = set_intra_op_val
+
+
+def test_tf_cpu_setting():
 
+    manager = multiprocessing.Manager()
+    shared_context_output = manager.dict()
 
-def test_tf_cpu_environment_setting():
+    child_process = multiprocessing.get_context("spawn").Process(
+        target=tf_cpu_setter, args=("3", "2", shared_context_output)
+    )
+    child_process.start()
+    child_process.join()
 
-    child_process = multiprocessing.Process(target=tf_cpu_environment_setter, args=())
-    assert tf.config.threading.get_inter_op_parallelism_threads() == 2
-    assert tf.config.threading.get_intra_op_parallelism_threads() == 3
+    assert shared_context_output[ENV_CPU_INTER_OP_CONFIG] == 3
+    assert shared_context_output[ENV_CPU_INTRA_OP_CONFIG] == 2
 
 
 @pytest.mark.parametrize(
     "gpu_config_string, parsed_gpu_config",
     [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],
 )
-def test_parsed_gpu_config(gpu_config_string, parsed_gpu_config):
+def test_gpu_config_parser(gpu_config_string, parsed_gpu_config):
 
     assert parse_gpu_config(gpu_config_string) == parsed_gpu_config

From 935537c190fd098cf6beb762fbb6635eaf109290 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 13 Feb 2020 16:42:52 +0100
Subject: [PATCH 388/633] don't create tf faunction for dataset

---
 rasa/utils/tensorflow/model_data.py |  5 +--
 rasa/utils/tensorflow/models.py     | 66 +++++++++++++----------------
 2 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index b4918a16b9c3..5a8cc16df705 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -304,10 +304,7 @@ def batch_tuple_sizes(self) -> Dict[Text, int]:
         return tuple_sizes
 
     def as_tf_dataset(
-        self,
-        batch_size: Union[tf.Tensor, int],
-        batch_strategy: Text = "sequence",
-        shuffle: bool = False,
+        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
     ) -> tf.data.Dataset:
         """Create tf dataset."""
 
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 77e67fddb739..cd40519fec05 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -70,12 +70,12 @@ def fit(
             )
 
         (
-            tf_train_dataset_function,
+            train_dataset_function,
             tf_train_on_batch_function,
         ) = self._get_tf_train_functions(eager, model_data, batch_strategy)
 
         (
-            tf_evaluation_dataset_function,
+            evaluation_dataset_function,
             tf_evaluation_on_batch_function,
         ) = self._get_tf_evaluation_functions(
             eager, evaluate_on_num_examples, evaluation_model_data
@@ -86,14 +86,9 @@ def fit(
 
         for ep in pbar:
             ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
-            if not eager:
-                ep_batch_size *= tf.ones((), tf.int32)
 
             self._batch_loop(
-                tf_train_dataset_function,
-                tf_train_on_batch_function,
-                ep_batch_size,
-                True,
+                train_dataset_function, tf_train_on_batch_function, ep_batch_size, True
             )
 
             postfix_dict = self._get_metric_results()
@@ -101,7 +96,7 @@ def fit(
             if evaluate_on_num_examples > 0:
                 if self._should_evaluate(evaluate_every_num_epochs, epochs, ep):
                     self._batch_loop(
-                        tf_evaluation_dataset_function,
+                        evaluation_dataset_function,
                         tf_evaluation_on_batch_function,
                         ep_batch_size,
                         False,
@@ -130,14 +125,9 @@ def train_on_batch(
     def build_for_predict(
         self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
-        def predict_dataset_function(  # to reuse the same helper method
-            _batch_size: Union[tf.Tensor, int]
-        ) -> tf.data.Dataset:
-            return predict_data.as_tf_dataset(_batch_size, "sequence", shuffle=False)
-
         self._training = False  # needed for tf graph mode
-        _, self._predict_function = self._get_tf_functions(
-            predict_dataset_function, self.batch_predict, eager, "prediction"
+        self._predict_function = self._get_tf_call_model_function(
+            predict_data.as_tf_dataset, self.batch_predict, eager, "prediction"
         )
 
     def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
@@ -194,7 +184,7 @@ def _batch_loop(
         self,
         dataset_function: Callable,
         call_model_function: Callable,
-        batch_size: Union[tf.Tensor, int],
+        batch_size: int,
         training: bool,
     ) -> None:
         """Run on batches"""
@@ -205,45 +195,43 @@ def _batch_loop(
             call_model_function(batch_in)
 
     @staticmethod
-    def _get_tf_functions(
+    def _get_tf_call_model_function(
         dataset_function: Callable,
         call_model_function: Callable,
         eager: bool,
         phase: Text,
-    ) -> Tuple[Callable, Callable]:
+    ) -> Callable:
         """Convert functions to tensorflow functions"""
 
         if eager:
-            return dataset_function, call_model_function
+            return call_model_function
 
         logger.debug(f"Building tensorflow {phase} graph...")
-        # allows increasing batch size
-        tf_dataset_function = tf.function(func=dataset_function)
 
-        init_dataset = tf_dataset_function(tf.ones((), tf.int32))
-
-        tf_method_function = tf.function(
+        init_dataset = dataset_function(1)
+        tf_call_model_function = tf.function(
             call_model_function, input_signature=[init_dataset.element_spec]
         )
-        tf_method_function(next(iter(init_dataset)))
+        tf_call_model_function(next(iter(init_dataset)))
 
         logger.debug(f"Finished building tensorflow {phase} graph.")
 
-        return tf_dataset_function, tf_method_function
+        return tf_call_model_function
 
     def _get_tf_train_functions(
         self, eager: bool, model_data: RasaModelData, batch_strategy: Text
     ) -> Tuple[Callable, Callable]:
         """Create train tensorflow functions"""
 
-        def train_dataset_function(
-            _batch_size: Union[tf.Tensor, int]
-        ) -> tf.data.Dataset:
+        def train_dataset_function(_batch_size: int) -> tf.data.Dataset:
             return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True)
 
         self._training = True  # needed for tf graph mode
-        return self._get_tf_functions(
-            train_dataset_function, self.train_on_batch, eager, "train"
+        return (
+            train_dataset_function,
+            self._get_tf_call_model_function(
+                train_dataset_function, self.train_on_batch, eager, "train"
+            ),
         )
 
     def _get_tf_evaluation_functions(
@@ -256,16 +244,20 @@ def _get_tf_evaluation_functions(
 
         if evaluate_on_num_examples > 0:
 
-            def evaluation_dataset_function(
-                _batch_size: Union[tf.Tensor, int]
-            ) -> tf.data.Dataset:
+            def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset:
                 return evaluation_model_data.as_tf_dataset(
                     _batch_size, "sequence", shuffle=False
                 )
 
             self._training = False  # needed for tf graph mode
-            return self._get_tf_functions(
-                evaluation_dataset_function, self._total_batch_loss, eager, "evaluation"
+            return (
+                evaluation_dataset_function,
+                self._get_tf_call_model_function(
+                    evaluation_dataset_function,
+                    self._total_batch_loss,
+                    eager,
+                    "evaluation",
+                ),
             )
 
         return None, None

From 7bf37cf4b8b96092627aea6d995b029112eb76e6 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 13 Feb 2020 16:56:35 +0100
Subject: [PATCH 389/633] refactor evaluation function creation

---
 rasa/utils/tensorflow/models.py | 41 ++++++++++++++-------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index cd40519fec05..7318a6779e7c 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -73,13 +73,10 @@ def fit(
             train_dataset_function,
             tf_train_on_batch_function,
         ) = self._get_tf_train_functions(eager, model_data, batch_strategy)
-
         (
             evaluation_dataset_function,
             tf_evaluation_on_batch_function,
-        ) = self._get_tf_evaluation_functions(
-            eager, evaluate_on_num_examples, evaluation_model_data
-        )
+        ) = self._get_tf_evaluation_functions(eager, evaluation_model_data)
 
         val_results = {}  # validation is not performed every epoch
         pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
@@ -235,32 +232,28 @@ def train_dataset_function(_batch_size: int) -> tf.data.Dataset:
         )
 
     def _get_tf_evaluation_functions(
-        self,
-        eager: bool,
-        evaluate_on_num_examples: int,
-        evaluation_model_data: RasaModelData,
+        self, eager: bool, evaluation_model_data: Optional[RasaModelData],
     ) -> Tuple[Optional[Callable], Optional[Callable]]:
         """Create evaluation tensorflow functions"""
 
-        if evaluate_on_num_examples > 0:
-
-            def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset:
-                return evaluation_model_data.as_tf_dataset(
-                    _batch_size, "sequence", shuffle=False
-                )
+        if evaluation_model_data is None:
+            return None, None
 
-            self._training = False  # needed for tf graph mode
-            return (
-                evaluation_dataset_function,
-                self._get_tf_call_model_function(
-                    evaluation_dataset_function,
-                    self._total_batch_loss,
-                    eager,
-                    "evaluation",
-                ),
+        def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset:
+            return evaluation_model_data.as_tf_dataset(
+                _batch_size, "sequence", shuffle=False
             )
 
-        return None, None
+        self._training = False  # needed for tf graph mode
+        return (
+            evaluation_dataset_function,
+            self._get_tf_call_model_function(
+                evaluation_dataset_function,
+                self._total_batch_loss,
+                eager,
+                "evaluation",
+            ),
+        )
 
     def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]:
         """Get the metrics results"""

From 618ffae3edfdbef90671e1f28deff855df53355e Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Thu, 13 Feb 2020 17:09:31 +0100
Subject: [PATCH 390/633] tqdm over traiing examples

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 4043736d5a9f..9da16903c6cf 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -185,7 +185,7 @@ def train(
 
         batch_size = 64
 
-        for attribute in tqdm(DENSE_FEATURIZABLE_ATTRIBUTES):
+        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
 
             non_empty_examples = list(
                 filter(lambda x: x.get(attribute), training_data.training_examples)
@@ -204,7 +204,7 @@ def train(
 
                 batch_features = self._compute_features(batch_examples, attribute)
 
-                for index, ex in enumerate(batch_examples):
+                for index, ex in tqdm(enumerate(batch_examples)):
 
                     ex.set(
                         DENSE_FEATURE_NAMES[attribute],

From ef527c001fdb948810f3d7c7937176132cb9c4c6 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 17:38:49 +0100
Subject: [PATCH 391/633] address comments

---
 changelog/5230.feature.rst           |  5 +--
 docs/api/tensorflow_usage.rst        | 51 +++++++++++++---------------
 rasa/__main__.py                     |  5 ++-
 rasa/utils/tensorflow/environment.py | 50 ++++++++++++++++-----------
 4 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst
index 5d7599047663..c22e6f5f161a 100644
--- a/changelog/5230.feature.rst
+++ b/changelog/5230.feature.rst
@@ -1,6 +1,7 @@
-Refactor how GPU and CPU environments are configured for tensorflow 2.0
+Refactor how GPU and CPU environments are configured for TensorFlow 2.0
 
-Environment variables to set and description is shown in the example below:
+Please refer to the `documentation <https://rasa.com/docs/rasa/api/tensorflow_usage.html>`_ to understand
+which environment variables to set in what scenarios. A couple of examples are shown below as well:
 
 .. code-block:: python
 
diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index 8ecc0473a677..cb1dd5b9fba6 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -1,44 +1,43 @@
-:desc: Find out how to configure your environment for efficient usage of tensorflow inside Rasa
+:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa OSS
 
 .. _tensorflow_usage:
 
-Setting up Tensorflow Runtime
-=============================
+Setting up the TensorFlow Runtime
+=================================
 
-Tensorflow allows setting the runtime environment via
-`TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa supports a smaller subset of these
-configuration options and makes appropriate calls to the `tf.config` submodule.
-This smaller subset comprises of configurations that developers frequently use with Rasa.
+TensorFlow allows setting the runtime environment via
+`TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa OSS supports a smaller subset of these
+configuration options and makes appropriate calls to the ``tf.config`` submodule.
+This smaller subset comprises of configurations that developers frequently use with Rasa OSS.
 All configuration options are specified using environment variables as shown in subsequent sections.
 
+Optimizing CPU Performance
+--------------------------
 
-Optimize CPU Performance
-------------------------
-
-Parallelize one operation
-^^^^^^^^^^^^^^^^^^^^^^^^^
+Parallelizing one operation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
-to parallelize the execution of one operation. If left unspecified, this value defaults to 0 which means tensorflow should
+to parallelize the execution of one operation. If left unspecified, this value defaults to ``0`` which means TensorFlow should
 pick an appropriate value depending on the system configuration.
 
 
-Parallelize multiple operations
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Parallelizing multiple operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
-to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to 0
-which means tensorflow should pick an appropriate value depending on the system configuration.
+to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to ``0``
+which means TensorFlow should pick an appropriate value depending on the system configuration.
 
 
-Optimize GPU Performance
-------------------------
+Optimizing GPU Performance
+--------------------------
 
 Limiting GPU memory growth
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Tensorflow by default blocks all the available GPU memory for the running process. This can be limiting if you are running
-multiple tensorflow processes and want to distribute memory across them. To prevent this,
+TensorFlow by default blocks all the available GPU memory for the running process. This can be limiting if you are running
+multiple TensorFlow processes and want to distribute memory across them. To prevent this,
 set an environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
 
 
@@ -47,12 +46,10 @@ Restricting absolute GPU memory available
 
 Often, a developer wants to limit the absolute amount of GPU memory that can be used by a process.
 
-For example, you may have two visible GPUs(GPU:0 and GPU:1) and you want to allocate 1024 MB from first GPU and 2048 MB from second GPU.
+For example, you may have two visible GPUs(``GPU:0`` and ``GPU:1``) and you want to allocate 1024 MB from the first GPU
+and 2048 MB from the second GPU.
 You can do so by setting an environment variable as ``TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"``.
 
-Another scenario can be where you have access to 2 GPUs(GPU:0 and GPU:1) but you would like to use only second GPU for
-Rasa process.
+Another scenario can be where you have access to 2 GPUs(``GPU:0`` and ``GPU:1``) but you would like to use only second
+GPU for the Rasa OSS process.
 ``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory from GPU 1 available for the Rasa process
-
-
-
diff --git a/rasa/__main__.py b/rasa/__main__.py
index 482807f1f261..b3c17edae862 100644
--- a/rasa/__main__.py
+++ b/rasa/__main__.py
@@ -8,7 +8,7 @@
 from rasa.cli.arguments.default_arguments import add_logging_options
 from rasa.cli.utils import parse_last_positional_argument_as_model_path
 from rasa.utils.common import set_log_level
-from rasa.utils.tensorflow.environment import setup_tf_environment
+import rasa.utils.tensorflow.environment as tf_env
 
 logger = logging.getLogger(__name__)
 
@@ -69,8 +69,7 @@ def main() -> None:
     )
     set_log_level(log_level)
 
-    # Set tensorflow environment
-    setup_tf_environment()
+    tf_env.setup_tf_environment()
 
     # insert current path in syspath so custom modules are found
     sys.path.insert(1, os.getcwd())
diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 6ee35dea6bde..84ea0bbdab5b 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,37 +1,33 @@
 import logging
 import os
 from typing import Text, Tuple
-
-import tensorflow as tf
+import warnings
 from rasa.constants import (
     ENV_GPU_CONFIG,
     ENV_CPU_INTER_OP_CONFIG,
     ENV_CPU_INTRA_OP_CONFIG,
 )
+from tensorflow import config as tf_config
 
 logger = logging.getLogger(__name__)
 
 
 def setup_gpu_environment() -> None:
+    """Set configuration for a GPU environment based on the environment variable set"""
 
     gpu_memory_config = os.getenv(ENV_GPU_CONFIG, None)
-
     if gpu_memory_config:
-
-        # Parse GPU config
         parsed_gpu_config = parse_gpu_config(gpu_memory_config)
-
-        physical_gpus = tf.config.list_physical_devices("GPU")
+        physical_gpus = tf_config.list_physical_devices("GPU")
 
         # Logic taken from https://www.tensorflow.org/guide/gpu
         if physical_gpus:
-
             for gpu_id, gpu_id_memory in parsed_gpu_config.items():
                 try:
-                    tf.config.experimental.set_virtual_device_configuration(
+                    tf_config.experimental.set_virtual_device_configuration(
                         physical_gpus[gpu_id],
                         [
-                            tf.config.experimental.VirtualDeviceConfiguration(
+                            tf_config.experimental.VirtualDeviceConfiguration(
                                 memory_limit=gpu_id_memory
                             )
                         ],
@@ -45,12 +41,13 @@ def setup_gpu_environment() -> None:
                     )
 
         else:
-            logger.info(
+            warnings.warn(
                 f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure"
             )
 
 
 def parse_gpu_config(gpu_memory_config: Text):
+    """Parse GPU configuration variable from a string to a dict"""
 
     # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
     # Parse it and store in a dictionary
@@ -73,24 +70,39 @@ def parse_gpu_config(gpu_memory_config: Text):
 
 
 def setup_cpu_environment() -> Tuple[int, int]:
+    """Set configuration for the CPU environment based on the environment variable set"""
 
     inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
     intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
 
     if inter_op_parallel_threads:
-        tf.config.threading.set_inter_op_parallelism_threads(
-            int(inter_op_parallel_threads.strip())
-        )
+
+        try:
+            inter_op_parallel_threads = int(inter_op_parallel_threads.strip())
+        except ValueError:
+            raise ValueError(
+                f"Error parsing the environment variable '{ENV_CPU_INTER_OP_CONFIG}'. Please "
+                f"cross-check the value"
+            )
+
+        tf_config.threading.set_inter_op_parallelism_threads(inter_op_parallel_threads)
 
     if intra_op_parallel_threads:
-        tf.config.threading.set_intra_op_parallelism_threads(
-            int(intra_op_parallel_threads.strip())
-        )
+
+        try:
+            intra_op_parallel_threads = int(intra_op_parallel_threads.strip())
+        except ValueError:
+            raise ValueError(
+                f"Error parsing the environment variable '{ENV_CPU_INTRA_OP_CONFIG}'. Please "
+                f"cross-check the value"
+            )
+
+        tf_config.threading.set_intra_op_parallelism_threads(intra_op_parallel_threads)
 
     # Returning the actual values as a confirmation. Helps with tests too.
     return (
-        tf.config.threading.get_inter_op_parallelism_threads(),
-        tf.config.threading.get_intra_op_parallelism_threads(),
+        tf_config.threading.get_inter_op_parallelism_threads(),
+        tf_config.threading.get_intra_op_parallelism_threads(),
     )
 
 

From 879bd0db250213a443382962a12e9b77d295332d Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 17:42:34 +0100
Subject: [PATCH 392/633] fix type annotations

---
 rasa/utils/tensorflow/environment.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 84ea0bbdab5b..46745f25d61a 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Text, Tuple
+from typing import Text, Tuple, Dict
 import warnings
 from rasa.constants import (
     ENV_GPU_CONFIG,
@@ -15,7 +15,7 @@
 def setup_gpu_environment() -> None:
     """Set configuration for a GPU environment based on the environment variable set"""
 
-    gpu_memory_config = os.getenv(ENV_GPU_CONFIG, None)
+    gpu_memory_config = os.getenv(ENV_GPU_CONFIG)
     if gpu_memory_config:
         parsed_gpu_config = parse_gpu_config(gpu_memory_config)
         physical_gpus = tf_config.list_physical_devices("GPU")
@@ -46,7 +46,7 @@ def setup_gpu_environment() -> None:
             )
 
 
-def parse_gpu_config(gpu_memory_config: Text):
+def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
     """Parse GPU configuration variable from a string to a dict"""
 
     # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
@@ -72,8 +72,8 @@ def parse_gpu_config(gpu_memory_config: Text):
 def setup_cpu_environment() -> Tuple[int, int]:
     """Set configuration for the CPU environment based on the environment variable set"""
 
-    inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG, None)
-    intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG, None)
+    inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG)
+    intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG)
 
     if inter_op_parallel_threads:
 
@@ -106,7 +106,7 @@ def setup_cpu_environment() -> Tuple[int, int]:
     )
 
 
-def setup_tf_environment():
+def setup_tf_environment() -> None:
 
     setup_cpu_environment()
     setup_gpu_environment()

From 0fec427357a174daa1360251536442d37f166f3d Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 13 Feb 2020 17:54:50 +0100
Subject: [PATCH 393/633] fix imports

---
 rasa/utils/tensorflow/environment.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 46745f25d61a..fe200ce64dd1 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -7,7 +7,6 @@
     ENV_CPU_INTER_OP_CONFIG,
     ENV_CPU_INTRA_OP_CONFIG,
 )
-from tensorflow import config as tf_config
 
 logger = logging.getLogger(__name__)
 
@@ -15,6 +14,8 @@
 def setup_gpu_environment() -> None:
     """Set configuration for a GPU environment based on the environment variable set"""
 
+    from tensorflow import config as tf_config
+
     gpu_memory_config = os.getenv(ENV_GPU_CONFIG)
     if gpu_memory_config:
         parsed_gpu_config = parse_gpu_config(gpu_memory_config)
@@ -72,6 +73,8 @@ def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
 def setup_cpu_environment() -> Tuple[int, int]:
     """Set configuration for the CPU environment based on the environment variable set"""
 
+    from tensorflow import config as tf_config
+
     inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG)
     intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG)
 

From eea5231b321affb3e90d4c1598acda4cbdcc276a Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Thu, 13 Feb 2020 18:05:03 +0100
Subject: [PATCH 394/633] comments; better tqdm with description

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 6 +++++-
 rasa/nlu/tokenizers/convert_tokenizer.py                    | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 9da16903c6cf..c5f5b6bc03f8 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -37,6 +37,8 @@ def _load_model(self) -> None:
         import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+        # required to take care of cases when other files are already 
+        # stored in the default TFHUB_CACHE_DIR
         try:
             self.module = tfhub.load(model_url)
         except OSError:
@@ -192,8 +194,10 @@ def train(
             )
 
             batch_start_index = 0
+            pbar = tqdm(total= (len(non_empty_examples) // batch_size) + 1, desc = attribute.capitalize() + ' batches:')
 
             while batch_start_index < len(non_empty_examples):
+                pbar.update(1)
 
                 batch_end_index = min(
                     batch_start_index + batch_size, len(non_empty_examples)
@@ -204,7 +208,7 @@ def train(
 
                 batch_features = self._compute_features(batch_examples, attribute)
 
-                for index, ex in tqdm(enumerate(batch_examples)):
+                for index, ex in enumerate(batch_examples):
 
                     ex.set(
                         DENSE_FEATURE_NAMES[attribute],
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 03eaf03b09a6..bb7a05866d6b 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -37,6 +37,8 @@ def _load_tokenizer_params(self):
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
 
+        # required to take care of cases when other files are already 
+        # stored in the default TFHUB_CACHE_DIR
         try:
             self.module = tfhub.load(model_url)
         except OSError:

From fc71ed7de594d0f1e191f9e583c6c13c1289f59c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 14 Feb 2020 10:33:22 +0100
Subject: [PATCH 395/633] change crf extractor defaults

---
 rasa/nlu/extractors/crf_entity_extractor.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 49c4df99d067..02e36c023b6d 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -30,8 +30,6 @@
     DROPRATE,
     REGULARIZATION_CONSTANT,
     BILOU_FLAG,
-    SHARE_HIDDEN_LAYERS,
-    UNIDIRECTIONAL_ENCODER,
 )
 from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.models import RasaModel
@@ -72,13 +70,13 @@ class CRFEntityExtractor(DIETClassifier):
         # nn architecture
         # sizes of hidden layers before the embedding layer for input words
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128]},
+        HIDDEN_LAYERS_SIZES: {TEXT: []},
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
         BATCH_SIZES: [64, 256],
         # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "sequence",  # string 'sequence' or 'balanced'
         # number of epochs
         EPOCHS: 300,
         # set random seed to any int to get reproducible results
@@ -103,7 +101,7 @@ class CRFEntityExtractor(DIETClassifier):
         # BILOU_flag determines whether to use BILOU tagging or not.
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
-        BILOU_FLAG: False,
+        BILOU_FLAG: True,
     }
     # end default properties (DOC MARKER - don't remove)
 

From 5a88cd50201f080c1cc617d0546c0be0b0968b71 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 14 Feb 2020 12:06:47 +0100
Subject: [PATCH 396/633] test just one pytest file on travis

---
 Makefile                           | 2 +-
 tests/utils/test_tf_environment.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 6f2aefe7128e..92997ee369ff 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,7 @@ prepare-tests-files:
 
 test: clean
 	# OMP_NUM_THREADS can improve overral performance using one thread by process (on tensorflow), avoiding overload
-	OMP_NUM_THREADS=1 pytest tests -n $(JOBS) --cov rasa
+	OMP_NUM_THREADS=1 pytest tests/utils/test_tf_environment.py -n $(JOBS) --cov rasa
 
 doctest: clean
 	cd docs && make doctest
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index e93e5a979476..f4dc24961662 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -23,7 +23,7 @@ def tf_cpu_setter(
 
 def test_tf_cpu_setting():
 
-    manager = multiprocessing.Manager()
+    manager = multiprocessing.get_context("spawn").Manager()
     shared_context_output = manager.dict()
 
     child_process = multiprocessing.get_context("spawn").Process(

From 6f2821834a54edfbaddeb10766e1213aac40363c Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 14 Feb 2020 13:13:50 +0100
Subject: [PATCH 397/633] added all tests to pytest now

---
 Makefile                      |  2 +-
 docs/api/tensorflow_usage.rst | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 92997ee369ff..6f2aefe7128e 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,7 @@ prepare-tests-files:
 
 test: clean
 	# OMP_NUM_THREADS can improve overral performance using one thread by process (on tensorflow), avoiding overload
-	OMP_NUM_THREADS=1 pytest tests/utils/test_tf_environment.py -n $(JOBS) --cov rasa
+	OMP_NUM_THREADS=1 pytest tests -n $(JOBS) --cov rasa
 
 doctest: clean
 	cd docs && make doctest
diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index cb1dd5b9fba6..c2eddcfcf0a3 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -1,4 +1,4 @@
-:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa OSS
+:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa Open Source
 
 .. _tensorflow_usage:
 
@@ -6,9 +6,9 @@ Setting up the TensorFlow Runtime
 =================================
 
 TensorFlow allows setting the runtime environment via
-`TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa OSS supports a smaller subset of these
+`TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa Open Source supports a smaller subset of these
 configuration options and makes appropriate calls to the ``tf.config`` submodule.
-This smaller subset comprises of configurations that developers frequently use with Rasa OSS.
+This smaller subset comprises of configurations that developers frequently use with Rasa Open Source.
 All configuration options are specified using environment variables as shown in subsequent sections.
 
 Optimizing CPU Performance
@@ -50,6 +50,6 @@ For example, you may have two visible GPUs(``GPU:0`` and ``GPU:1``) and you want
 and 2048 MB from the second GPU.
 You can do so by setting an environment variable as ``TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"``.
 
-Another scenario can be where you have access to 2 GPUs(``GPU:0`` and ``GPU:1``) but you would like to use only second
-GPU for the Rasa OSS process.
-``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory from GPU 1 available for the Rasa process
+Another scenario can be where you have access to 2 GPUs(``GPU:0`` and ``GPU:1``) but you would like to use only the second
+GPU.
+``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory availble from GPU 1.

From af6f03723504e57ff057c96161fdf84d0e54a9f5 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Fri, 14 Feb 2020 15:26:17 +0100
Subject: [PATCH 398/633] trying to fix the test. Multiprocessing :(

---
 tests/utils/test_tf_environment.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index f4dc24961662..c0b8d88ab6ed 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -23,10 +23,9 @@ def tf_cpu_setter(
 
 def test_tf_cpu_setting():
 
-    manager = multiprocessing.get_context("spawn").Manager()
-    shared_context_output = manager.dict()
+    shared_context_output = multiprocessing.Manager().dict()
 
-    child_process = multiprocessing.get_context("spawn").Process(
+    child_process = multiprocessing.Process(
         target=tf_cpu_setter, args=("3", "2", shared_context_output)
     )
     child_process.start()

From 792a6a9a5722669e197c2e1041fade48204d6331 Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Fri, 14 Feb 2020 15:59:17 +0100
Subject: [PATCH 399/633] response selector settings

---
 rasa/nlu/selectors/response_selector.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 25123eb3110b..f5b0097435fe 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -94,13 +94,13 @@ class ResponseSelector(DIETClassifier):
         # sizes of hidden layers before the embedding layer
         # for input words and responses
         # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []},
+        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
         # Whether to share the hidden layer weights between input words and intent labels
         SHARE_HIDDEN_LAYERS: False,
         # number of units in transformer
-        TRANSFORMER_SIZE: 256,
+        TRANSFORMER_SIZE: None,
         # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 2,
+        NUM_TRANSFORMER_LAYERS: 0,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # max sequence length if pos_encoding='emb'
@@ -154,7 +154,7 @@ class ResponseSelector(DIETClassifier):
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: True,
+        SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
         # how often to calculate training accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance

From 61fb73173aa1855844c0c05033bf1509222c62e2 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 14 Feb 2020 17:34:21 +0100
Subject: [PATCH 400/633] add name to kernel_mask

---
 rasa/utils/tensorflow/layers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index cc69955e1873..a2bf2e192e2d 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -77,7 +77,9 @@ def build(self, input_shape: tf.TensorShape) -> None:
         kernel_mask = tf.cast(
             tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
         )
-        self.kernel_mask = tf.Variable(initial_value=kernel_mask, trainable=False)
+        self.kernel_mask = tf.Variable(
+            initial_value=kernel_mask, trainable=False, name="kernel_mask"
+        )
 
     def call(self, inputs: tf.Tensor) -> tf.Tensor:
         # set some weights to 0 according to precomputed mask

From 19b700ce78065bc6ad9093eef181a1941dcd82d6 Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Mon, 17 Feb 2020 09:53:25 +0100
Subject: [PATCH 401/633] black formatting

---
 .../nlu/featurizers/dense_featurizer/convert_featurizer.py | 7 +++++--
 rasa/nlu/tokenizers/convert_tokenizer.py                   | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index c5f5b6bc03f8..e2bc527302b8 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -37,7 +37,7 @@ def _load_model(self) -> None:
         import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-        # required to take care of cases when other files are already 
+        # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
             self.module = tfhub.load(model_url)
@@ -194,7 +194,10 @@ def train(
             )
 
             batch_start_index = 0
-            pbar = tqdm(total= (len(non_empty_examples) // batch_size) + 1, desc = attribute.capitalize() + ' batches:')
+            pbar = tqdm(
+                total=(len(non_empty_examples) // batch_size) + 1,
+                desc=attribute.capitalize() + " batches:",
+            )
 
             while batch_start_index < len(non_empty_examples):
                 pbar.update(1)
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index bb7a05866d6b..12c23b744d33 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -37,7 +37,7 @@ def _load_tokenizer_params(self):
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
 
-        # required to take care of cases when other files are already 
+        # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
             self.module = tfhub.load(model_url)

From 98c43b25e586ed8193b898e700a195fb91942cc3 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 11:39:45 +0100
Subject: [PATCH 402/633] remove cpu env set test

---
 rasa/utils/tensorflow/environment.py | 50 +++++++++++++++++-----------
 tests/utils/test_tf_environment.py   | 28 ----------------
 2 files changed, 30 insertions(+), 48 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index fe200ce64dd1..a8366b2f0b0a 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,7 +1,7 @@
 import logging
 import os
 from typing import Text, Tuple, Dict
-import warnings
+import rasa.utils.common as rasa_utils
 from rasa.constants import (
     ENV_GPU_CONFIG,
     ENV_CPU_INTER_OP_CONFIG,
@@ -14,39 +14,49 @@
 def setup_gpu_environment() -> None:
     """Set configuration for a GPU environment based on the environment variable set"""
 
-    from tensorflow import config as tf_config
-
     gpu_memory_config = os.getenv(ENV_GPU_CONFIG)
     if gpu_memory_config:
+
+        # Import from tensorflow only if necessary(environment variable was set)
+        from tensorflow import config as tf_config
+
         parsed_gpu_config = parse_gpu_config(gpu_memory_config)
         physical_gpus = tf_config.list_physical_devices("GPU")
 
         # Logic taken from https://www.tensorflow.org/guide/gpu
         if physical_gpus:
             for gpu_id, gpu_id_memory in parsed_gpu_config.items():
-                try:
-                    tf_config.experimental.set_virtual_device_configuration(
-                        physical_gpus[gpu_id],
-                        [
-                            tf_config.experimental.VirtualDeviceConfiguration(
-                                memory_limit=gpu_id_memory
-                            )
-                        ],
-                    )
-
-                except RuntimeError:
-                    # Add a helper explanation where the error comes from
-                    raise RuntimeError(
-                        "Error while setting up tensorflow environment. "
-                        "Virtual devices must be set before GPUs have been initialized"
-                    )
+
+                allocate_gpu_memory(physical_gpus[gpu_id], gpu_id_memory)
 
         else:
-            warnings.warn(
+            rasa_utils.raise_warning(
                 f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure"
             )
 
 
+def allocate_gpu_memory(gpu_instance, logical_memory: int) -> None:
+
+    from tensorflow import config as tf_config
+
+    try:
+        tf_config.experimental.set_virtual_device_configuration(
+            gpu_instance,
+            [
+                tf_config.experimental.VirtualDeviceConfiguration(
+                    memory_limit=logical_memory
+                )
+            ],
+        )
+
+    except RuntimeError:
+        # Add a helper explanation where the error comes from
+        raise RuntimeError(
+            "Error while setting up tensorflow environment. "
+            "Virtual devices must be set before GPUs have been initialized"
+        )
+
+
 def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
     """Parse GPU configuration variable from a string to a dict"""
 
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index c0b8d88ab6ed..2f8024ae6f52 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -7,34 +7,6 @@
 from rasa.constants import ENV_CPU_INTER_OP_CONFIG, ENV_CPU_INTRA_OP_CONFIG
 
 
-def tf_cpu_setter(
-    inter_op_config: Text, intra_op_config: Text, shared_context_output: Dict[Text, int]
-):
-
-    monkeypatch = MonkeyPatch()
-    monkeypatch.setenv(ENV_CPU_INTRA_OP_CONFIG, intra_op_config)
-    monkeypatch.setenv(ENV_CPU_INTER_OP_CONFIG, inter_op_config)
-
-    set_inter_op_val, set_intra_op_val = setup_cpu_environment()
-
-    shared_context_output[ENV_CPU_INTER_OP_CONFIG] = set_inter_op_val
-    shared_context_output[ENV_CPU_INTRA_OP_CONFIG] = set_intra_op_val
-
-
-def test_tf_cpu_setting():
-
-    shared_context_output = multiprocessing.Manager().dict()
-
-    child_process = multiprocessing.Process(
-        target=tf_cpu_setter, args=("3", "2", shared_context_output)
-    )
-    child_process.start()
-    child_process.join()
-
-    assert shared_context_output[ENV_CPU_INTER_OP_CONFIG] == 3
-    assert shared_context_output[ENV_CPU_INTRA_OP_CONFIG] == 2
-
-
 @pytest.mark.parametrize(
     "gpu_config_string, parsed_gpu_config",
     [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],

From d970ae9844b60caec19856d125d76bba65a106e8 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 11:40:23 +0100
Subject: [PATCH 403/633] remove unused imports

---
 tests/utils/test_tf_environment.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index 2f8024ae6f52..f44eed243f7c 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -1,10 +1,5 @@
 import pytest
-from _pytest.monkeypatch import MonkeyPatch
-from typing import Text, Dict
-import multiprocessing
-from rasa.utils.tensorflow.environment import setup_cpu_environment
 from rasa.utils.tensorflow.environment import parse_gpu_config
-from rasa.constants import ENV_CPU_INTER_OP_CONFIG, ENV_CPU_INTRA_OP_CONFIG
 
 
 @pytest.mark.parametrize(

From a96159b0d6be6d0098f068ea852f63eec373b090 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 11:55:59 +0100
Subject: [PATCH 404/633] refactor function

---
 rasa/utils/tensorflow/environment.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index a8366b2f0b0a..52016da934fd 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -1,6 +1,11 @@
 import logging
 import os
-from typing import Text, Tuple, Dict
+from typing import Text, Dict
+import typing
+
+if typing.TYPE_CHECKING:
+    from tensorflow import config as tf_config
+
 import rasa.utils.common as rasa_utils
 from rasa.constants import (
     ENV_GPU_CONFIG,
@@ -35,7 +40,9 @@ def setup_gpu_environment() -> None:
             )
 
 
-def allocate_gpu_memory(gpu_instance, logical_memory: int) -> None:
+def allocate_gpu_memory(
+    gpu_instance: "tf_config.PhysicalDevice", logical_memory: int
+) -> None:
 
     from tensorflow import config as tf_config
 
@@ -80,14 +87,17 @@ def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
     return parsed_gpu_config
 
 
-def setup_cpu_environment() -> Tuple[int, int]:
+def setup_cpu_environment() -> None:
     """Set configuration for the CPU environment based on the environment variable set"""
 
-    from tensorflow import config as tf_config
-
     inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG)
     intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG)
 
+    if not inter_op_parallel_threads and not intra_op_parallel_threads:
+        return
+
+    from tensorflow import config as tf_config
+
     if inter_op_parallel_threads:
 
         try:
@@ -112,12 +122,6 @@ def setup_cpu_environment() -> Tuple[int, int]:
 
         tf_config.threading.set_intra_op_parallelism_threads(intra_op_parallel_threads)
 
-    # Returning the actual values as a confirmation. Helps with tests too.
-    return (
-        tf_config.threading.get_inter_op_parallelism_threads(),
-        tf_config.threading.get_intra_op_parallelism_threads(),
-    )
-
 
 def setup_tf_environment() -> None:
 

From 6146bf96589537a294b996a92d496ff6ddc59b61 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 13:44:38 +0100
Subject: [PATCH 405/633] Apply suggestions from code review

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 changelog/5230.feature.rst           |  5 ++---
 docs/api/tensorflow_usage.rst        | 10 +++++-----
 rasa/utils/tensorflow/environment.py | 10 +++-------
 tests/utils/test_tf_environment.py   |  1 -
 4 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst
index c22e6f5f161a..5aeae8295c2e 100644
--- a/changelog/5230.feature.rst
+++ b/changelog/5230.feature.rst
@@ -8,9 +8,8 @@ which environment variables to set in what scenarios. A couple of examples are s
     # This specifies to use 1024 MB of memory from GPU with logical ID 0 and 2048 MB of memory from GPU with logical ID 1
     TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"
 
-    # Specifies that atmost 3 CPU threads can be used to parallelize multiple non-blocking operations
+    # Specifies that at most 3 CPU threads can be used to parallelize multiple non-blocking operations
     TF_INTER_OP_PARALLELISM_THREADS="3"
 
-    # Specifies that atmost 2 CPU threads can be used to parallelize a particular operation.
+    # Specifies that at most 2 CPU threads can be used to parallelize a particular operation.
     TF_INTRA_OP_PARALLELISM_THREADS="2"
-
diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index c2eddcfcf0a3..40d9076067c8 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -14,7 +14,7 @@ All configuration options are specified using environment variables as shown in
 Optimizing CPU Performance
 --------------------------
 
-Parallelizing one operation
+Parallelizing One Operation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
@@ -22,7 +22,7 @@ to parallelize the execution of one operation. If left unspecified, this value d
 pick an appropriate value depending on the system configuration.
 
 
-Parallelizing multiple operations
+Parallelizing Multiple Operations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
@@ -33,7 +33,7 @@ which means TensorFlow should pick an appropriate value depending on the system
 Optimizing GPU Performance
 --------------------------
 
-Limiting GPU memory growth
+Limiting GPU Memory Growth
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 TensorFlow by default blocks all the available GPU memory for the running process. This can be limiting if you are running
@@ -41,7 +41,7 @@ multiple TensorFlow processes and want to distribute memory across them. To prev
 set an environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
 
 
-Restricting absolute GPU memory available
+Restricting Absolute GPU Memory Available
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Often, a developer wants to limit the absolute amount of GPU memory that can be used by a process.
@@ -52,4 +52,4 @@ You can do so by setting an environment variable as ``TF_GPU_MEMORY_ALLOC="0:102
 
 Another scenario can be where you have access to 2 GPUs(``GPU:0`` and ``GPU:1``) but you would like to use only the second
 GPU.
-``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory availble from GPU 1.
+``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory available from GPU 1.
diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 52016da934fd..b49981f43dc4 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -17,7 +17,7 @@
 
 
 def setup_gpu_environment() -> None:
-    """Set configuration for a GPU environment based on the environment variable set"""
+    """Set configuration for TensorFlow GPU environment based on the environment variable set."""
 
     gpu_memory_config = os.getenv(ENV_GPU_CONFIG)
     if gpu_memory_config:
@@ -31,7 +31,6 @@ def setup_gpu_environment() -> None:
         # Logic taken from https://www.tensorflow.org/guide/gpu
         if physical_gpus:
             for gpu_id, gpu_id_memory in parsed_gpu_config.items():
-
                 allocate_gpu_memory(physical_gpus[gpu_id], gpu_id_memory)
 
         else:
@@ -81,7 +80,7 @@ def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
     except ValueError:
         # Add a helper explanation
         raise ValueError(
-            f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}'"
+            f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}'."
         )
 
     return parsed_gpu_config
@@ -99,19 +98,17 @@ def setup_cpu_environment() -> None:
     from tensorflow import config as tf_config
 
     if inter_op_parallel_threads:
-
         try:
             inter_op_parallel_threads = int(inter_op_parallel_threads.strip())
         except ValueError:
             raise ValueError(
                 f"Error parsing the environment variable '{ENV_CPU_INTER_OP_CONFIG}'. Please "
-                f"cross-check the value"
+                f"cross-check the value."
             )
 
         tf_config.threading.set_inter_op_parallelism_threads(inter_op_parallel_threads)
 
     if intra_op_parallel_threads:
-
         try:
             intra_op_parallel_threads = int(intra_op_parallel_threads.strip())
         except ValueError:
@@ -124,6 +121,5 @@ def setup_cpu_environment() -> None:
 
 
 def setup_tf_environment() -> None:
-
     setup_cpu_environment()
     setup_gpu_environment()
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/test_tf_environment.py
index f44eed243f7c..7366e66ac690 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/test_tf_environment.py
@@ -7,5 +7,4 @@
     [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],
 )
 def test_gpu_config_parser(gpu_config_string, parsed_gpu_config):
-
     assert parse_gpu_config(gpu_config_string) == parsed_gpu_config

From 99a3fe2d8df63a30d8d3dbef8976764110c7a348 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 13:46:09 +0100
Subject: [PATCH 406/633] Apply suggestions from code review

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 docs/api/tensorflow_usage.rst        | 3 ---
 rasa/utils/tensorflow/environment.py | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index 40d9076067c8..ef6786b294b3 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -21,7 +21,6 @@ Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify ma
 to parallelize the execution of one operation. If left unspecified, this value defaults to ``0`` which means TensorFlow should
 pick an appropriate value depending on the system configuration.
 
-
 Parallelizing Multiple Operations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -29,7 +28,6 @@ Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify ma
 to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to ``0``
 which means TensorFlow should pick an appropriate value depending on the system configuration.
 
-
 Optimizing GPU Performance
 --------------------------
 
@@ -40,7 +38,6 @@ TensorFlow by default blocks all the available GPU memory for the running proces
 multiple TensorFlow processes and want to distribute memory across them. To prevent this,
 set an environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
 
-
 Restricting Absolute GPU Memory Available
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index b49981f43dc4..3de779030b65 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -114,7 +114,7 @@ def setup_cpu_environment() -> None:
         except ValueError:
             raise ValueError(
                 f"Error parsing the environment variable '{ENV_CPU_INTRA_OP_CONFIG}'. Please "
-                f"cross-check the value"
+                f"cross-check the value."
             )
 
         tf_config.threading.set_intra_op_parallelism_threads(intra_op_parallel_threads)

From 14b6ea57e783a165a9fe8b48b76bc17ffc57ea4c Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 14:14:28 +0100
Subject: [PATCH 407/633] changes from code comments

---
 rasa/utils/tensorflow/environment.py          | 68 +++++++++++--------
 tests/utils/tensorflow/__init__.py            |  0
 .../{ => tensorflow}/test_tf_environment.py   |  4 +-
 3 files changed, 41 insertions(+), 31 deletions(-)
 create mode 100644 tests/utils/tensorflow/__init__.py
 rename tests/utils/{ => tensorflow}/test_tf_environment.py (62%)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 3de779030b65..b5371caa9d14 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -2,10 +2,6 @@
 import os
 from typing import Text, Dict
 import typing
-
-if typing.TYPE_CHECKING:
-    from tensorflow import config as tf_config
-
 import rasa.utils.common as rasa_utils
 from rasa.constants import (
     ENV_GPU_CONFIG,
@@ -13,35 +9,47 @@
     ENV_CPU_INTRA_OP_CONFIG,
 )
 
+if typing.TYPE_CHECKING:
+    from tensorflow import config as tf_config
+
 logger = logging.getLogger(__name__)
 
 
-def setup_gpu_environment() -> None:
+def _setup_gpu_environment() -> None:
     """Set configuration for TensorFlow GPU environment based on the environment variable set."""
 
     gpu_memory_config = os.getenv(ENV_GPU_CONFIG)
-    if gpu_memory_config:
 
-        # Import from tensorflow only if necessary(environment variable was set)
-        from tensorflow import config as tf_config
+    if not gpu_memory_config:
+        return
 
-        parsed_gpu_config = parse_gpu_config(gpu_memory_config)
-        physical_gpus = tf_config.list_physical_devices("GPU")
+    # Import from tensorflow only if necessary(environment variable was set)
+    from tensorflow import config as tf_config
 
-        # Logic taken from https://www.tensorflow.org/guide/gpu
-        if physical_gpus:
-            for gpu_id, gpu_id_memory in parsed_gpu_config.items():
-                allocate_gpu_memory(physical_gpus[gpu_id], gpu_id_memory)
+    parsed_gpu_config = _parse_gpu_config(gpu_memory_config)
+    physical_gpus = tf_config.list_physical_devices("GPU")
 
-        else:
-            rasa_utils.raise_warning(
-                f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure"
-            )
+    # Logic taken from https://www.tensorflow.org/guide/gpu
+    if physical_gpus:
+        for gpu_id, gpu_id_memory in parsed_gpu_config.items():
+            _allocate_gpu_memory(physical_gpus[gpu_id], gpu_id_memory)
+
+    else:
+        rasa_utils.raise_warning(
+            f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure."
+        )
 
 
-def allocate_gpu_memory(
+def _allocate_gpu_memory(
     gpu_instance: "tf_config.PhysicalDevice", logical_memory: int
 ) -> None:
+    """
+    Create a new logical device out of the received GPU instance with specified amount of logical memory.
+
+    Args:
+        gpu_instance: PhysicalDevice instance of a GPU device.
+        logical_memory: Absolute amount of memory to be allocated to the new logical device.
+    """
 
     from tensorflow import config as tf_config
 
@@ -56,15 +64,15 @@ def allocate_gpu_memory(
         )
 
     except RuntimeError:
-        # Add a helper explanation where the error comes from
+        # Helper explanation of where the error comes from
         raise RuntimeError(
             "Error while setting up tensorflow environment. "
-            "Virtual devices must be set before GPUs have been initialized"
+            "Virtual devices must be set before GPUs have been initialized."
         )
 
 
-def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
-    """Parse GPU configuration variable from a string to a dict"""
+def _parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
+    """Parse GPU configuration variable from a string to a dict."""
 
     # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
     # Parse it and store in a dictionary
@@ -78,16 +86,17 @@ def parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
 
             parsed_gpu_config[instance_gpu_id] = instance_gpu_mem
     except ValueError:
-        # Add a helper explanation
+        # Helper explanation of where the error comes from
         raise ValueError(
-            f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}'."
+            f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}' "
+            f"at https://rasa.com/docs/rasa/api/tensorflow_usage.html#restricting-absolute-gpu-memory-available ."
         )
 
     return parsed_gpu_config
 
 
-def setup_cpu_environment() -> None:
-    """Set configuration for the CPU environment based on the environment variable set"""
+def _setup_cpu_environment() -> None:
+    """Set configuration for the CPU environment based on the environment variable set."""
 
     inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG)
     intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG)
@@ -121,5 +130,6 @@ def setup_cpu_environment() -> None:
 
 
 def setup_tf_environment() -> None:
-    setup_cpu_environment()
-    setup_gpu_environment()
+    """Setup CPU and GPU related environment settings for TensorFlow."""
+    _setup_cpu_environment()
+    _setup_gpu_environment()
diff --git a/tests/utils/tensorflow/__init__.py b/tests/utils/tensorflow/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/utils/test_tf_environment.py b/tests/utils/tensorflow/test_tf_environment.py
similarity index 62%
rename from tests/utils/test_tf_environment.py
rename to tests/utils/tensorflow/test_tf_environment.py
index 7366e66ac690..3fbcd480ace6 100644
--- a/tests/utils/test_tf_environment.py
+++ b/tests/utils/tensorflow/test_tf_environment.py
@@ -1,5 +1,5 @@
 import pytest
-from rasa.utils.tensorflow.environment import parse_gpu_config
+from rasa.utils.tensorflow.environment import _parse_gpu_config
 
 
 @pytest.mark.parametrize(
@@ -7,4 +7,4 @@
     [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],
 )
 def test_gpu_config_parser(gpu_config_string, parsed_gpu_config):
-    assert parse_gpu_config(gpu_config_string) == parsed_gpu_config
+    assert _parse_gpu_config(gpu_config_string) == parsed_gpu_config

From 98d54a9300d6a3181d933fd7dfa46b33bdbccfae Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 14:19:51 +0100
Subject: [PATCH 408/633] add docstring

---
 rasa/utils/tensorflow/environment.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index b5371caa9d14..0fc52c07fdb7 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -43,8 +43,7 @@ def _setup_gpu_environment() -> None:
 def _allocate_gpu_memory(
     gpu_instance: "tf_config.PhysicalDevice", logical_memory: int
 ) -> None:
-    """
-    Create a new logical device out of the received GPU instance with specified amount of logical memory.
+    """Create a new logical device out of the received GPU instance with specified amount of logical memory.
 
     Args:
         gpu_instance: PhysicalDevice instance of a GPU device.
@@ -72,7 +71,14 @@ def _allocate_gpu_memory(
 
 
 def _parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]:
-    """Parse GPU configuration variable from a string to a dict."""
+    """Parse GPU configuration variable from a string to a dict.
+
+    Args:
+        gpu_memory_config: String containing the configuration for GPU usage.
+
+    Returns:
+        Parsed configuration as a dictionary with GPU IDs as keys and requested memory as the value.
+    """
 
     # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory"
     # Parse it and store in a dictionary

From 69bd8f2c10527e802861d185d26f5b300080939b Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 15:00:59 +0100
Subject: [PATCH 409/633] Update rasa/utils/tensorflow/environment.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/utils/tensorflow/environment.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 0fc52c07fdb7..2cfbdd70e8d1 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -137,5 +137,6 @@ def _setup_cpu_environment() -> None:
 
 def setup_tf_environment() -> None:
     """Setup CPU and GPU related environment settings for TensorFlow."""
+
     _setup_cpu_environment()
     _setup_gpu_environment()

From e32eb80f92199c140fda26c19fa611d66f735808 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 17 Feb 2020 15:51:10 +0100
Subject: [PATCH 410/633] fix types in utils/hugging_face

---
 .../transformers_pre_post_processors.py       | 59 +++++++++++--------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index 6fe8a22488d3..9273bd5a2d7a 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -57,9 +57,10 @@ def xlm_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 
 
 def bert_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from BERT by removing CLS and SEP embeddings and returning CLS
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from BERT by removing CLS and SEP embeddings and
+    returning CLS
 
     token embedding as sentence representation"""
 
@@ -70,9 +71,10 @@ def bert_embeddings_post_processor(
 
 
 def gpt_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from GPT models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from GPT models by taking a mean over sequence
+    embeddings and
 
     returning that as sentence representation"""
 
@@ -83,12 +85,13 @@ def gpt_embeddings_post_processor(
 
 
 def xlnet_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from XLNet models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from XLNet models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove last two time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove last two time steps corresponding
+    to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[:-2]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -97,12 +100,13 @@ def xlnet_embeddings_post_processor(
 
 
 def roberta_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from Roberta models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from Roberta models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove first and last time steps
+    corresponding to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -111,12 +115,13 @@ def roberta_embeddings_post_processor(
 
 
 def xlm_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from XLM models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from XLM models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove first and last time steps
+    corresponding to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -125,28 +130,32 @@ def xlm_embeddings_post_processor(
 
 
 def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("##", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a
+    token into sub-tokens"""
 
     tokens = [string.replace("</w>", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("Ġ", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def xlnet_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("▁", "") for string in token_strings]
     return [string for string in tokens if string]

From 2707f0f26330afe9f71b9e2399531d6f1c2f3dc0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 17 Feb 2020 16:03:23 +0100
Subject: [PATCH 411/633] keep old response selector

---
 rasa/nlu/registry.py                    |  19 +-
 rasa/nlu/selectors/diet_selector.py     | 427 ++++++++++++++++++++++++
 rasa/nlu/selectors/response_selector.py | 202 +----------
 3 files changed, 448 insertions(+), 200 deletions(-)
 create mode 100644 rasa/nlu/selectors/diet_selector.py

diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index 74190848b635..f9b3a2217f4d 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -33,6 +33,7 @@
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.response_selector import ResponseSelector
+from rasa.nlu.selectors.diet_selector import DIETSelector
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
@@ -92,6 +93,7 @@
     EmbeddingIntentClassifier,
     # selectors
     ResponseSelector,
+    DIETSelector,
 ]
 
 # Mapping from a components name to its class to allow name based lookup.
@@ -130,13 +132,7 @@
         {"name": "SpacyTokenizer"},
         {"name": "SpacyFeaturizer"},
         {"name": "RegexFeaturizer"},
-        {"name": "LexicalSyntacticFeaturizer"},
-        {
-            "name": "DIETClassifier",
-            INTENT_CLASSIFICATION: False,
-            ENTITY_RECOGNITION: True,
-            NUM_TRANSFORMER_LAYERS: 0,
-        },
+        {"name": "CRFEntityExtractor"},
         {"name": "EntitySynonymMapper"},
         {"name": "SklearnIntentClassifier"},
     ],
@@ -144,7 +140,8 @@
     "supervised_embeddings": [
         {"name": "WhitespaceTokenizer"},
         {"name": "RegexFeaturizer"},
-        {"name": "LexicalSyntacticFeaturizer"},
+        {"name": "CRFEntityExtractor"},
+        {"name": "EntitySynonymMapper"},
         {"name": "CountVectorsFeaturizer"},
         {
             "name": "CountVectorsFeaturizer",
@@ -152,14 +149,12 @@
             "min_ngram": 1,
             "max_ngram": 4,
         },
-        {"name": "DIETClassifier"},
-        {"name": "EntitySynonymMapper"},
+        {"name": "EmbeddingIntentClassifier"},
     ],
     "pretrained_embeddings_convert": [
         {"name": "ConveRTTokenizer"},
         {"name": "ConveRTFeaturizer"},
-        {"name": "LexicalSyntacticFeaturizer"},
-        {"name": "DIETClassifier"},
+        {"name": "EmbeddingIntentClassifier"},
     ],
 }
 
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
new file mode 100644
index 000000000000..fa3dab4b6f48
--- /dev/null
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -0,0 +1,427 @@
+import logging
+
+import numpy as np
+import tensorflow as tf
+
+from typing import Any, Dict, Optional, Text, Tuple, Union
+
+from rasa.nlu.training_data import TrainingData, Message
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
+from rasa.nlu.components import any_of
+from rasa.utils.tensorflow.constants import (
+    LABEL,
+    HIDDEN_LAYERS_SIZES,
+    SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
+    NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
+    BATCH_SIZES,
+    BATCH_STRATEGY,
+    EPOCHS,
+    RANDOM_SEED,
+    LEARNING_RATE,
+    DENSE_DIM,
+    RANKING_LENGTH,
+    LOSS_TYPE,
+    SIMILARITY_TYPE,
+    NUM_NEG,
+    SPARSE_INPUT_DROPOUT,
+    MASKED_LM,
+    ENTITY_RECOGNITION,
+    INTENT_CLASSIFICATION,
+    EVAL_NUM_EXAMPLES,
+    EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
+    DROPRATE,
+    DROPRATE_ATTENTION,
+    NEG_MARGIN_SCALE,
+    REGULARIZATION_CONSTANT,
+    SCALE_LOSS,
+    USE_MAX_SIM_NEG,
+    MU_NEG,
+    MU_POS,
+    EMBED_DIM,
+    BILOU_FLAG,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
+)
+from rasa.nlu.constants import (
+    RESPONSE,
+    RESPONSE_SELECTOR_PROPERTY_NAME,
+    DEFAULT_OPEN_UTTERANCE_TYPE,
+    DENSE_FEATURE_NAMES,
+    TEXT,
+    SPARSE_FEATURE_NAMES,
+)
+from rasa.utils.tensorflow.model_data import RasaModelData
+from rasa.utils.tensorflow.models import RasaModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class DIETSelector(DIETClassifier):
+    """Response selector using supervised embeddings.
+
+    The response selector embeds user inputs
+    and candidate response into the same space.
+    Supervised embeddings are trained by maximizing similarity between them.
+    It also provides rankings of the response that did not "win".
+
+    The supervised response selector needs to be preceded by
+    a featurizer in the pipeline.
+    This featurizer creates the features used for the embeddings.
+    It is recommended to use ``CountVectorsFeaturizer`` that
+    can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``.
+
+    Based on the starspace idea from: https://arxiv.org/abs/1709.03856.
+    However, in this implementation the `mu` parameter is treated differently
+    and additional hidden layers are added together with dropout.
+    """
+
+    provides = [RESPONSE, "response_ranking"]
+
+    requires = [
+        any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
+        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
+    ]
+
+    # default properties (DOC MARKER - don't remove)
+    defaults = {
+        # nn architecture
+        # sizes of hidden layers before the embedding layer
+        # for input words and responses
+        # the number of hidden layers is thus equal to the length of this list
+        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
+        # Whether to share the hidden layer weights between input words and intent labels
+        SHARE_HIDDEN_LAYERS: False,
+        # number of units in transformer
+        TRANSFORMER_SIZE: None,
+        # number of transformer layers
+        NUM_TRANSFORMER_LAYERS: 0,
+        # number of attention heads in transformer
+        NUM_HEADS: 4,
+        # max sequence length if pos_encoding='emb'
+        MAX_SEQ_LENGTH: 256,
+        # training parameters
+        # initial and final batch sizes - batch size will be
+        # linearly increased for each epoch
+        BATCH_SIZES: [64, 256],
+        # how to create batches
+        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        # number of epochs
+        EPOCHS: 300,
+        # set random seed to any int to get reproducible results
+        RANDOM_SEED: None,
+        # optimizer
+        LEARNING_RATE: 0.001,
+        # embedding parameters
+        # default dense dimension used if no dense features are present
+        DENSE_DIM: {TEXT: 512, LABEL: 512},
+        # dimension size of embedding vectors
+        EMBED_DIM: 20,
+        # the type of the similarity
+        NUM_NEG: 20,
+        # flag if minimize only maximum similarity over incorrect actions
+        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
+        # the type of the loss function
+        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        # number of top responses to normalize scores for softmax loss_type
+        # set to 0 to turn off normalization
+        RANKING_LENGTH: 10,
+        # how similar the algorithm should try
+        # to make embedding vectors for correct intent labels
+        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        # maximum negative similarity for incorrect intent labels
+        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        # flag: if true, only minimize the maximum similarity for
+        # incorrect intent labels
+        USE_MAX_SIM_NEG: True,
+        # scale loss inverse proportionally to confidence of correct prediction
+        SCALE_LOSS: True,
+        # regularization parameters
+        # the scale of L2 regularization
+        REGULARIZATION_CONSTANT: 0.002,
+        # the scale of how critical the algorithm should be of minimizing the
+        # maximum similarity between embeddings of different intent labels
+        NEG_MARGIN_SCALE: 0.8,
+        # dropout rate for rnn
+        DROPRATE: 0.2,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
+        # if true apply dropout to sparse tensors
+        SPARSE_INPUT_DROPOUT: False,
+        # visualization of accuracy
+        # how often to calculate training accuracy
+        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
+        # how many examples to use for calculation of training accuracy
+        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
+        MASKED_LM: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
+        # selector config
+        # name of the intent for which this response selector is to be trained
+        "retrieval_intent": None,
+    }
+    # end default properties (DOC MARKER - don't remove)
+
+    def __init__(
+        self,
+        component_config: Optional[Dict[Text, Any]] = None,
+        inverted_label_dict: Optional[Dict[int, Text]] = None,
+        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        model: Optional[RasaModel] = None,
+        batch_tuple_sizes: Optional[Dict] = None,
+    ) -> None:
+
+        component_config = component_config or {}
+
+        # the following properties cannot be adapted for the ResponseSelector
+        component_config[INTENT_CLASSIFICATION] = True
+        component_config[ENTITY_RECOGNITION] = False
+        component_config[BILOU_FLAG] = False
+
+        super().__init__(
+            component_config,
+            inverted_label_dict,
+            inverted_tag_dict,
+            model,
+            batch_tuple_sizes,
+        )
+
+    @property
+    def label_key(self) -> Text:
+        return "label_ids"
+
+    @staticmethod
+    def model_class():
+        return DIET2DIET
+
+    def _load_selector_params(self, config: Dict[Text, Any]) -> None:
+        self.retrieval_intent = config["retrieval_intent"]
+        if not self.retrieval_intent:
+            # retrieval intent was left to its default value
+            logger.info(
+                "Retrieval intent parameter was left to its default value. This "
+                "response selector will be trained on training examples combining "
+                "all retrieval intents."
+            )
+
+    def _check_config_parameters(self) -> None:
+        super()._check_config_parameters()
+        self._load_selector_params(self.component_config)
+
+    @staticmethod
+    def _set_message_property(
+        message: Message, prediction_dict: Dict[Text, Any], selector_key: Text
+    ) -> None:
+
+        message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {})
+        message_selector_properties[selector_key] = prediction_dict
+        message.set(
+            RESPONSE_SELECTOR_PROPERTY_NAME,
+            message_selector_properties,
+            add_to_output=True,
+        )
+
+    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
+        """Performs sanity checks on training data, extracts encodings for labels
+        and prepares data for training"""
+        if self.retrieval_intent:
+            training_data = training_data.filter_by_intent(self.retrieval_intent)
+
+        label_id_dict = self._create_label_id_dict(training_data, attribute=RESPONSE)
+        self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
+
+        self._label_data = self._create_label_data(
+            training_data, label_id_dict, attribute=RESPONSE
+        )
+
+        model_data = self._create_model_data(
+            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
+        )
+
+        self.check_input_dimension_consistency(model_data)
+
+        return model_data
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+        """Return the most likely response and its similarity to the input."""
+
+        out = self._predict(message)
+        label, label_ranking = self._predict_label(out)
+
+        selector_key = (
+            self.retrieval_intent
+            if self.retrieval_intent
+            else DEFAULT_OPEN_UTTERANCE_TYPE
+        )
+
+        logger.debug(
+            f"Adding following selector key to message property: {selector_key}"
+        )
+
+        prediction_dict = {"response": label, "ranking": label_ranking}
+
+        self._set_message_property(message, prediction_dict, selector_key)
+
+
+class DIET2DIET(DIET):
+    def _check_data(self) -> None:
+        if "text_features" not in self.data_signature:
+            raise ValueError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if "label_features" not in self.data_signature:
+            raise ValueError(
+                f"No label features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if (
+            self.config[SHARE_HIDDEN_LAYERS]
+            and self.data_signature["text_features"]
+            != self.data_signature["label_features"]
+        ):
+            raise ValueError(
+                "If hidden layer weights are shared, data signatures "
+                "for text_features and label_features must coincide."
+            )
+
+    def _create_metrics(self) -> None:
+        # self.metrics preserve order
+        # output losses first
+        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
+        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
+        # output accuracies second
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
+
+    def _update_metrics_to_log(self) -> None:
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_loss", "m_acc"]
+
+        self.metrics_to_log += ["r_loss", "r_acc"]
+
+    def _prepare_layers(self) -> None:
+        self.text_name = TEXT
+        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
+
+        self._prepare_sequence_layers(self.text_name)
+        self._prepare_sequence_layers(self.label_name)
+        if self.config[MASKED_LM]:
+            self._prepare_mask_lm_layers(self.text_name)
+        self._prepare_label_classification_layers()
+
+    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_label_ids = self.tf_label_data["label_ids"][0]
+
+        mask_label = self.tf_label_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            self.tf_label_data["label_features"], mask_label, self.label_name
+        )
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+
+        all_labels_embed = self._tf_layers["embed.label"](cls_label)
+
+        return all_label_ids, all_labels_embed
+
+    def batch_loss(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> tf.Tensor:
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        (
+            text_transformed,
+            text_in,
+            text_seq_ids,
+            lm_mask_bool_text,
+        ) = self._create_sequence(
+            tf_batch_data["text_features"],
+            mask_text,
+            self.text_name,
+            self.config[MASKED_LM],
+            sequence_ids=True,
+        )
+
+        mask_label = tf_batch_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["label_features"], mask_label, self.label_name
+        )
+
+        losses = []
+
+        if self.config[MASKED_LM]:
+            loss, acc = self._mask_loss(
+                text_transformed,
+                text_in,
+                text_seq_ids,
+                lm_mask_bool_text,
+                self.text_name,
+            )
+
+            self.mask_loss.update_state(loss)
+            self.mask_acc.update_state(acc)
+            losses.append(loss)
+
+        # get _cls_ vector for label classification
+        cls_text = self._last_token(text_transformed, sequence_lengths_text)
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+        label_ids = tf_batch_data["label_ids"][0]
+
+        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
+        self.response_loss.update_state(loss)
+        self.response_acc.update_state(acc)
+        losses.append(loss)
+
+        return tf.math.add_n(losses)
+
+    def batch_predict(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
+        tf_batch_data = self.batch_to_model_data_format(
+            batch_in, self.predict_data_signature
+        )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        text_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, self.text_name
+        )
+
+        out = {}
+
+        if self.all_labels_embed is None:
+            _, self.all_labels_embed = self._create_all_labels()
+
+        # get _cls_ vector for intent classification
+        cls = self._last_token(text_transformed, sequence_lengths_text)
+        cls_embed = self._tf_layers["embed.text"](cls)
+
+        sim_all = self._tf_layers["loss.label"].sim(
+            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
+        )
+        scores = self._tf_layers["loss.label"].confidence_from_sim(
+            sim_all, self.config[SIMILARITY_TYPE]
+        )
+        out["i_scores"] = scores
+
+        return out
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index f5b0097435fe..84a1e0cdd337 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -1,21 +1,15 @@
 import logging
 
-import numpy as np
-import tensorflow as tf
-
-from typing import Any, Dict, List, Optional, Text, Tuple, Union
+from typing import Any, Dict, Optional, Text
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -32,9 +26,7 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    UNIDIRECTIONAL_ENCODER,
     DROPRATE,
-    DROPRATE_ATTENTION,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -43,9 +35,6 @@
     MU_POS,
     EMBED_DIM,
     BILOU_FLAG,
-    KEY_RELATIVE_ATTENTION,
-    VALUE_RELATIVE_ATTENTION,
-    MAX_RELATIVE_POSITION,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -57,7 +46,8 @@
 )
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
-
+from rasa.utils.common import raise_warning
+from rasa.constants import DOCS_BASE_URL
 
 logger = logging.getLogger(__name__)
 
@@ -95,16 +85,9 @@ class ResponseSelector(DIETClassifier):
         # for input words and responses
         # the number of hidden layers is thus equal to the length of this list
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent labels
+        # Whether to share the hidden layer weights between input words and intent
+        # labels
         SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
-        TRANSFORMER_SIZE: None,
-        # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 0,
-        # number of attention heads in transformer
-        NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -149,10 +132,6 @@ class ResponseSelector(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
-        # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
@@ -163,12 +142,6 @@ class ResponseSelector(DIETClassifier):
         # if true random tokens of the input message will be masked and the model
         # should predict those tokens
         MASKED_LM: False,
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
@@ -190,6 +163,8 @@ def __init__(
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[BILOU_FLAG] = False
+        component_config[MASKED_LM] = False
+        component_config[NUM_TRANSFORMER_LAYERS] = 0
 
         super().__init__(
             component_config,
@@ -199,14 +174,16 @@ def __init__(
             batch_tuple_sizes,
         )
 
+        raise_warning(
+            f"'ResponseSelector' is deprecated. Use 'DIETSelector' instead.",
+            category=FutureWarning,
+            docs=f"{DOCS_BASE_URL}/nlu/components/",
+        )
+
     @property
     def label_key(self) -> Text:
         return "label_ids"
 
-    @staticmethod
-    def model_class():
-        return DIET2DIET
-
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
@@ -274,154 +251,3 @@ def process(self, message: Message, **kwargs: Any) -> None:
         prediction_dict = {"response": label, "ranking": label_ranking}
 
         self._set_message_property(message, prediction_dict, selector_key)
-
-
-class DIET2DIET(DIET):
-    def _check_data(self) -> None:
-        if "text_features" not in self.data_signature:
-            raise ValueError(
-                f"No text features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if "label_features" not in self.data_signature:
-            raise ValueError(
-                f"No label features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if (
-            self.config[SHARE_HIDDEN_LAYERS]
-            and self.data_signature["text_features"]
-            != self.data_signature["label_features"]
-        ):
-            raise ValueError(
-                "If hidden layer weights are shared, data signatures "
-                "for text_features and label_features must coincide."
-            )
-
-    def _create_metrics(self) -> None:
-        # self.metrics preserve order
-        # output losses first
-        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
-        # output accuracies second
-        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
-        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
-
-    def _update_metrics_to_log(self) -> None:
-        if self.config[MASKED_LM]:
-            self.metrics_to_log += ["m_loss", "m_acc"]
-
-        self.metrics_to_log += ["r_loss", "r_acc"]
-
-    def _prepare_layers(self) -> None:
-        self.text_name = TEXT
-        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
-
-        self._prepare_sequence_layers(self.text_name)
-        self._prepare_sequence_layers(self.label_name)
-        if self.config[MASKED_LM]:
-            self._prepare_mask_lm_layers(self.text_name)
-        self._prepare_label_classification_layers()
-
-    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label_ids = self.tf_label_data["label_ids"][0]
-
-        mask_label = self.tf_label_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data["label_features"], mask_label, self.label_name
-        )
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-
-        all_labels_embed = self._tf_layers["embed.label"](cls_label)
-
-        return all_label_ids, all_labels_embed
-
-    def batch_loss(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> tf.Tensor:
-        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        (
-            text_transformed,
-            text_in,
-            text_seq_ids,
-            lm_mask_bool_text,
-        ) = self._create_sequence(
-            tf_batch_data["text_features"],
-            mask_text,
-            self.text_name,
-            self.config[MASKED_LM],
-            sequence_ids=True,
-        )
-
-        mask_label = tf_batch_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["label_features"], mask_label, self.label_name
-        )
-
-        losses = []
-
-        if self.config[MASKED_LM]:
-            loss, acc = self._mask_loss(
-                text_transformed,
-                text_in,
-                text_seq_ids,
-                lm_mask_bool_text,
-                self.text_name,
-            )
-
-            self.mask_loss.update_state(loss)
-            self.mask_acc.update_state(acc)
-            losses.append(loss)
-
-        # get _cls_ vector for label classification
-        cls_text = self._last_token(text_transformed, sequence_lengths_text)
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-        label_ids = tf_batch_data["label_ids"][0]
-
-        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
-        self.response_loss.update_state(loss)
-        self.response_acc.update_state(acc)
-        losses.append(loss)
-
-        return tf.math.add_n(losses)
-
-    def batch_predict(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = self.batch_to_model_data_format(
-            batch_in, self.predict_data_signature
-        )
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, self.text_name
-        )
-
-        out = {}
-
-        if self.all_labels_embed is None:
-            _, self.all_labels_embed = self._create_all_labels()
-
-        # get _cls_ vector for intent classification
-        cls = self._last_token(text_transformed, sequence_lengths_text)
-        cls_embed = self._tf_layers["embed.text"](cls)
-
-        sim_all = self._tf_layers["loss.label"].sim(
-            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
-        )
-        scores = self._tf_layers["loss.label"].confidence_from_sim(
-            sim_all, self.config[SIMILARITY_TYPE]
-        )
-        out["i_scores"] = scores
-
-        return out

From 75951818e46f59cad3c0fc5bf2a21c2e2fe34512 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 17 Feb 2020 16:07:56 +0100
Subject: [PATCH 412/633] Use only CLS vector if no transformer should be used

---
 rasa/nlu/classifiers/diet_classifier.py | 20 ++++++++++++++------
 rasa/utils/train_utils.py               | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 88762df951ee..1fefbd6e29c3 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -337,9 +337,8 @@ def _check_labels_features_exist(
                 return False
         return True
 
-    @staticmethod
-    def _extract_and_add_features(
-        message: Message, attribute: Text
+    def _extract_features(
+        self, message: Message, attribute: Text
     ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]:
         sparse_features = None
         dense_features = None
@@ -357,6 +356,15 @@ def _extract_and_add_features(
                     f"don't coincide in '{message.text}' for attribute '{attribute}'."
                 )
 
+        if (
+            self.component_config[NUM_TRANSFORMER_LAYERS] == 0
+            and self.component_config[ENTITY_RECOGNITION] == False
+            and attribute != INTENT
+        ):
+            # Use only the CLS token vector as features
+            sparse_features = train_utils.sequence_to_sentence_features(sparse_features)
+            dense_features = train_utils.sequence_to_sentence_features(dense_features)
+
         return sparse_features, dense_features
 
     def check_input_dimension_consistency(self, model_data: RasaModelData):
@@ -379,7 +387,7 @@ def _extract_labels_precomputed_features(
         dense_features = []
 
         for e in label_examples:
-            _sparse, _dense = self._extract_and_add_features(e, attribute)
+            _sparse, _dense = self._extract_features(e, attribute)
             if _sparse is not None:
                 sparse_features.append(_sparse)
             if _dense is not None:
@@ -479,14 +487,14 @@ def _create_model_data(
 
         for e in training_data:
             if label_attribute is None or e.get(label_attribute):
-                _sparse, _dense = self._extract_and_add_features(e, TEXT)
+                _sparse, _dense = self._extract_features(e, TEXT)
                 if _sparse is not None:
                     X_sparse.append(_sparse)
                 if _dense is not None:
                     X_dense.append(_dense)
 
             if e.get(label_attribute):
-                _sparse, _dense = self._extract_and_add_features(e, label_attribute)
+                _sparse, _dense = self._extract_features(e, label_attribute)
                 if _sparse is not None:
                     Y_sparse.append(_sparse)
                 if _dense is not None:
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 48e69dfd435a..47bab071effa 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,11 +1,13 @@
 import numpy as np
 import tensorflow as tf
 import logging
+import scipy.sparse
 from typing import Optional, Text, Dict, Any, Union, List
 from rasa.core.constants import DIALOGUE
 from rasa.nlu.constants import TEXT
 from rasa.nlu.tokenizers.tokenizer import Token
 
+
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -102,6 +104,22 @@ def align_tokens(
     return tokens_out
 
 
+def sequence_to_sentence_features(
+    features: Union[np.ndarray, scipy.sparse.spmatrix]
+) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]:
+    """Extract the CLS token vector as sentence features.
+    Features is a sequence. The last token is the CLS token. The feature vector of
+    this token contains the sentence features."""
+
+    if features is None:
+        return None
+
+    if isinstance(features, scipy.sparse.spmatrix):
+        return scipy.sparse.coo_matrix(features.tocsr()[-1])
+
+    return np.expand_dims(features[-1], axis=0)
+
+
 def _replace_deprecated_option(
     old_option: Text, new_option: Union[Text, List[Text]], config: Dict[Text, Any]
 ) -> Dict[Text, Any]:

From 6df1faecd3bcf1afe75d31b2b258c8d08477de76 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 17 Feb 2020 15:51:10 +0100
Subject: [PATCH 413/633] fix types in utils/hugging_face

---
 .../nlu/utils/hugging_face/hf_transformers.py | 14 ++---
 .../transformers_pre_post_processors.py       | 59 +++++++++++--------
 2 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 4094f5313f98..5c52c8b4cde0 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -112,8 +112,8 @@ def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
         return model_tokens_cleaners[self.model_name](token_strings)
 
     def _post_process_sequence_embeddings(
-        self, sequence_embeddings: np.array
-    ) -> Tuple[np.array, np.array]:
+        self, sequence_embeddings: np.ndarray
+    ) -> Tuple[np.ndarray, np.ndarray]:
 
         sentence_embeddings = []
         post_processed_sequence_embeddings = []
@@ -175,7 +175,7 @@ def _get_token_ids_for_batch(
         return batch_tokens, batch_token_ids
 
     @staticmethod
-    def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.array:
+    def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray:
 
         attention_mask = []
         max_seq_length = max(actual_sequence_lengths)
@@ -211,7 +211,7 @@ def _add_padding_to_batch(
 
     @staticmethod
     def _extract_nonpadded_embeddings(
-        embeddings: np.array, actual_sequence_lengths: List[int]
+        embeddings: np.ndarray, actual_sequence_lengths: List[int]
     ):
 
         nonpadded_sequence_embeddings = []
@@ -222,8 +222,8 @@ def _extract_nonpadded_embeddings(
         return np.array(nonpadded_sequence_embeddings)
 
     def _compute_batch_sequence_features(
-        self, batch_attention_mask: np.array, padded_token_ids: List[List[int]]
-    ) -> np.array:
+        self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]]
+    ) -> np.ndarray:
 
         model_outputs = self.model(
             np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
@@ -237,7 +237,7 @@ def _compute_batch_sequence_features(
 
     def _get_model_features_for_batch(
         self, batch_token_ids: List[List[int]]
-    ) -> Tuple[np.array, np.array]:
+    ) -> Tuple[np.ndarray, np.ndarray]:
 
         # Let's first add tokenizer specific special tokens to all examples
         batch_token_ids_augmented = self._add_lm_specific_special_tokens(
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index 6fe8a22488d3..9273bd5a2d7a 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -57,9 +57,10 @@ def xlm_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 
 
 def bert_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from BERT by removing CLS and SEP embeddings and returning CLS
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from BERT by removing CLS and SEP embeddings and
+    returning CLS
 
     token embedding as sentence representation"""
 
@@ -70,9 +71,10 @@ def bert_embeddings_post_processor(
 
 
 def gpt_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from GPT models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from GPT models by taking a mean over sequence
+    embeddings and
 
     returning that as sentence representation"""
 
@@ -83,12 +85,13 @@ def gpt_embeddings_post_processor(
 
 
 def xlnet_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from XLNet models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from XLNet models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove last two time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove last two time steps corresponding
+    to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[:-2]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -97,12 +100,13 @@ def xlnet_embeddings_post_processor(
 
 
 def roberta_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from Roberta models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from Roberta models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove first and last time steps
+    corresponding to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -111,12 +115,13 @@ def roberta_embeddings_post_processor(
 
 
 def xlm_embeddings_post_processor(
-    sequence_embeddings: np.array,
-) -> Tuple[np.array, np.array]:
-    """Post process embeddings from XLM models by taking a mean over sequence embeddings and
+    sequence_embeddings: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Post process embeddings from XLM models by taking a mean over sequence
+    embeddings and
 
-    returning that as sentence representation. Remove first and last time steps corresponding to special tokens from the
-    sequence embeddings."""
+    returning that as sentence representation. Remove first and last time steps
+    corresponding to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
@@ -125,28 +130,32 @@ def xlm_embeddings_post_processor(
 
 
 def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("##", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a
+    token into sub-tokens"""
 
     tokens = [string.replace("</w>", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("Ġ", "") for string in token_strings]
     return [string for string in tokens if string]
 
 
 def xlnet_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
-    """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token into sub-tokens"""
+    """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token
+    into sub-tokens"""
 
     tokens = [string.replace("▁", "") for string in token_strings]
     return [string for string in tokens if string]

From eb4fce6dcd7d4770676dedbb6e0cb3a308f87821 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 18:49:27 +0100
Subject: [PATCH 414/633] Apply suggestions from code review

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/utils/tensorflow/environment.py          | 2 +-
 tests/utils/tensorflow/test_tf_environment.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index 2cfbdd70e8d1..a3f86e0110bf 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -23,7 +23,7 @@ def _setup_gpu_environment() -> None:
     if not gpu_memory_config:
         return
 
-    # Import from tensorflow only if necessary(environment variable was set)
+    # Import from tensorflow only if necessary (environment variable was set)
     from tensorflow import config as tf_config
 
     parsed_gpu_config = _parse_gpu_config(gpu_memory_config)
diff --git a/tests/utils/tensorflow/test_tf_environment.py b/tests/utils/tensorflow/test_tf_environment.py
index 3fbcd480ace6..f8bd7d6916c2 100644
--- a/tests/utils/tensorflow/test_tf_environment.py
+++ b/tests/utils/tensorflow/test_tf_environment.py
@@ -6,5 +6,5 @@
     "gpu_config_string, parsed_gpu_config",
     [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})],
 )
-def test_gpu_config_parser(gpu_config_string, parsed_gpu_config):
+def test_gpu_config_parser(gpu_config_string: Text, parsed_gpu_config: Dict[int, int]):
     assert _parse_gpu_config(gpu_config_string) == parsed_gpu_config

From 895035940e08c3956ee75cc9f6da0741975fb02a Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 18:53:05 +0100
Subject: [PATCH 415/633] shorten docstring

---
 rasa/utils/tensorflow/environment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py
index a3f86e0110bf..cc8977f38e1b 100644
--- a/rasa/utils/tensorflow/environment.py
+++ b/rasa/utils/tensorflow/environment.py
@@ -43,7 +43,7 @@ def _setup_gpu_environment() -> None:
 def _allocate_gpu_memory(
     gpu_instance: "tf_config.PhysicalDevice", logical_memory: int
 ) -> None:
-    """Create a new logical device out of the received GPU instance with specified amount of logical memory.
+    """Create a new logical device for the requested amount of memory.
 
     Args:
         gpu_instance: PhysicalDevice instance of a GPU device.

From f168ce9e0b21fda0ccb62803c6ab7e397f2e5436 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Mon, 17 Feb 2020 21:41:14 +0100
Subject: [PATCH 416/633] fix import

---
 tests/utils/tensorflow/test_tf_environment.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/utils/tensorflow/test_tf_environment.py b/tests/utils/tensorflow/test_tf_environment.py
index f8bd7d6916c2..3f30570d7975 100644
--- a/tests/utils/tensorflow/test_tf_environment.py
+++ b/tests/utils/tensorflow/test_tf_environment.py
@@ -1,4 +1,5 @@
 import pytest
+from typing import Text, Dict
 from rasa.utils.tensorflow.environment import _parse_gpu_config
 
 

From 8ce158c8da4f287ed94269fdd039c66a3d10733d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 08:19:40 +0100
Subject: [PATCH 417/633] update tests

---
 tests/nlu/base/test_evaluation.py             | 44 ++++++++++---
 tests/nlu/base/test_synonyms.py               |  2 -
 tests/nlu/classifiers/test_diet_classifier.py |  2 -
 .../classifiers/test_keyword_classifier.py    |  1 -
 .../test_embedding_response_selector.py       | 25 --------
 tests/nlu/selectors/test_response_selector.py | 64 +++++++++++++++++++
 tests/nlu/training/test_train.py              | 13 +++-
 7 files changed, 108 insertions(+), 43 deletions(-)
 delete mode 100644 tests/nlu/selectors/test_embedding_response_selector.py
 create mode 100644 tests/nlu/selectors/test_response_selector.py

diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 42b75bf5e6b0..1c63a3c2f663 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -56,6 +56,9 @@
 # https://github.com/pytest-dev/pytest-asyncio/issues/68
 # this event_loop is used by pytest-asyncio, and redefining it
 # is currently the only way of changing the scope of this fixture
+from utils.tensorflow.constants import EPOCHS
+
+
 @pytest.yield_fixture(scope="session")
 def event_loop(request: Request) -> Iterator[asyncio.AbstractEventLoop]:
     loop = asyncio.get_event_loop_policy().new_event_loop()
@@ -275,12 +278,25 @@ def test_run_evaluation(unpacked_trained_moodbot_path):
         data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=False
     )
     assert result.get("intent_evaluation")
-    assert result.get("entity_evaluation").get("DIETClassifier")
+    assert result.get("entity_evaluation").get("CRFEntityExtractor")
 
 
 def test_run_cv_evaluation():
     td = training_data.load_data("data/examples/rasa/demo-rasa.json")
-    nlu_config = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")
+    nlu_config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "SpacyNLP"},
+                {"name": "SpacyTokenizer"},
+                {"name": "SpacyFeaturizer"},
+                {"name": "RegexFeaturizer"},
+                {"name": "CRFEntityExtractor", EPOCHS: 3},
+                {"name": "EntitySynonymMapper"},
+                {"name": "SklearnIntentClassifier"},
+            ],
+        }
+    )
 
     n_folds = 2
     intent_results, entity_results, response_selection_results = cross_validate(
@@ -293,12 +309,12 @@ def test_run_cv_evaluation():
     assert len(intent_results.test["Accuracy"]) == n_folds
     assert len(intent_results.test["Precision"]) == n_folds
     assert len(intent_results.test["F1-score"]) == n_folds
-    assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds
-    assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds
-    assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds
-    assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds
-    assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds
-    assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds
+    assert len(entity_results.train["CRFEntityExtractor"]["Accuracy"]) == n_folds
+    assert len(entity_results.train["CRFEntityExtractor"]["Precision"]) == n_folds
+    assert len(entity_results.train["CRFEntityExtractor"]["F1-score"]) == n_folds
+    assert len(entity_results.test["CRFEntityExtractor"]["Accuracy"]) == n_folds
+    assert len(entity_results.test["CRFEntityExtractor"]["Precision"]) == n_folds
+    assert len(entity_results.test["CRFEntityExtractor"]["F1-score"]) == n_folds
 
 
 def test_run_cv_evaluation_with_response_selector():
@@ -309,8 +325,16 @@ def test_run_cv_evaluation_with_response_selector():
     training_data_obj = training_data_obj.merge(training_data_responses_obj)
     training_data_obj.fill_response_phrases()
 
-    nlu_config = config.load(
-        "sample_configs/config_embedding_intent_response_selector.yml"
+    nlu_config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "DIETClassifier", EPOCHS: 2},
+                {"name": "DIETSelector", EPOCHS: 2},
+            ],
+        }
     )
 
     n_folds = 2
diff --git a/tests/nlu/base/test_synonyms.py b/tests/nlu/base/test_synonyms.py
index c9ef7c7eb58c..8bb48e62a442 100644
--- a/tests/nlu/base/test_synonyms.py
+++ b/tests/nlu/base/test_synonyms.py
@@ -1,6 +1,4 @@
 from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
-from rasa.nlu.model import Metadata
-import pytest
 
 
 def test_entity_synonyms():
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index bb2d143d13fd..074072e5ccbf 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -3,8 +3,6 @@
 
 from unittest.mock import Mock
 
-import scipy
-
 from rasa.nlu import train
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.config import RasaNLUModelConfig
diff --git a/tests/nlu/classifiers/test_keyword_classifier.py b/tests/nlu/classifiers/test_keyword_classifier.py
index fc7b203a9e70..e5101c93f939 100644
--- a/tests/nlu/classifiers/test_keyword_classifier.py
+++ b/tests/nlu/classifiers/test_keyword_classifier.py
@@ -5,7 +5,6 @@
 
 # TODO: add tests for other classifers
 # from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier
-# from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier
 from rasa.nlu.training_data.formats.rasa import RasaReader
 from rasa.nlu.training_data import load_data
 from rasa.nlu.training_data.message import Message
diff --git a/tests/nlu/selectors/test_embedding_response_selector.py b/tests/nlu/selectors/test_embedding_response_selector.py
deleted file mode 100644
index 6b040ec8b74e..000000000000
--- a/tests/nlu/selectors/test_embedding_response_selector.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from rasa.nlu.training_data import load_data
-from rasa.nlu import config
-from rasa.nlu.train import Trainer, Interpreter
-
-
-def test_train_response_selector(component_builder, tmpdir):
-    td = load_data("data/examples/rasa/demo-rasa.md")
-    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
-    td = td.merge(td_responses)
-    td.fill_response_phrases()
-
-    nlu_config = config.load(
-        "sample_configs/config_embedding_intent_response_selector.yml"
-    )
-
-    trainer = Trainer(nlu_config)
-    trainer.train(td)
-
-    persisted_path = trainer.persist(tmpdir)
-
-    assert trainer.pipeline
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
diff --git a/tests/nlu/selectors/test_response_selector.py b/tests/nlu/selectors/test_response_selector.py
new file mode 100644
index 000000000000..831d412b0d0f
--- /dev/null
+++ b/tests/nlu/selectors/test_response_selector.py
@@ -0,0 +1,64 @@
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import load_data
+from rasa.nlu.train import Trainer, Interpreter
+from rasa.utils.tensorflow.constants import EPOCHS
+
+
+def test_train_response_selector(component_builder, tmpdir):
+    td = load_data("data/examples/rasa/demo-rasa.md")
+    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
+    td = td.merge(td_responses)
+    td.fill_response_phrases()
+
+    nlu_config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "DIETClassifier", EPOCHS: 2},
+                {"name": "ResponseSelector", EPOCHS: 2},
+            ],
+        }
+    )
+
+    trainer = Trainer(nlu_config)
+    trainer.train(td)
+
+    persisted_path = trainer.persist(tmpdir)
+
+    assert trainer.pipeline
+    loaded = Interpreter.load(persisted_path, component_builder)
+    assert loaded.pipeline
+    assert loaded.parse("hello") is not None
+    assert loaded.parse("Hello today is Monday, again!") is not None
+
+
+def test_train_diet_selector(component_builder, tmpdir):
+    td = load_data("data/examples/rasa/demo-rasa.md")
+    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
+    td = td.merge(td_responses)
+    td.fill_response_phrases()
+
+    nlu_config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "DIETClassifier", EPOCHS: 2},
+                {"name": "DIETSelector", EPOCHS: 2},
+            ],
+        }
+    )
+
+    trainer = Trainer(nlu_config)
+    trainer.train(td)
+
+    persisted_path = trainer.persist(tmpdir)
+
+    assert trainer.pipeline
+    loaded = Interpreter.load(persisted_path, component_builder)
+    assert loaded.pipeline
+    assert loaded.parse("hello") is not None
+    assert loaded.parse("Hello today is Monday, again!") is not None
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py
index 7981da09c6b8..eb3e9443a96e 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/training/test_train.py
@@ -1,4 +1,5 @@
 import os
+
 import pytest
 
 from rasa.nlu import registry, train
@@ -12,7 +13,7 @@
 
 
 def as_pipeline(*components):
-    return [{"name": c, EPOCHS: 3} for c in components]
+    return [{"name": c, EPOCHS: 2} for c in components]
 
 
 def pipelines_for_tests():
@@ -47,6 +48,7 @@ def pipelines_for_tests():
                 "DIETClassifier",
                 "KeywordIntentClassifier",
                 "ResponseSelector",
+                "DIETSelector",
             ),
         ),
         (
@@ -124,8 +126,13 @@ async def test_random_seed(component_builder, tmpdir):
     """test if train result is the same for two runs of tf embedding"""
 
     _config = utilities.base_test_conf("supervised_embeddings")
-    # set fixed random seed of the DIET classifier to 1
-    _config.set_component_attr(5, random_seed=1)
+    # set fixed random seed
+    idx = _config.component_names.index("EmbeddingIntentClassifier")
+    _config.set_component_attr(idx, random_seed=1)
+    _config.set_component_attr(idx, epochs=1)
+    idx = _config.component_names.index("CRFEntityExtractor")
+    _config.set_component_attr(idx, random_seed=1)
+    _config.set_component_attr(idx, epochs=1)
 
     # first run
     (trained_a, _, persisted_path_a) = await train(

From eaafe2e26e9a81ed34f5f0b5150e4889081228ee Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 09:48:20 +0100
Subject: [PATCH 418/633] fix tests

---
 rasa/nlu/classifiers/diet_classifier.py      |  2 +-
 tests/core/test_restore.py                   |  6 ++++--
 tests/nlu/base/test_config.py                | 12 +++++++++---
 tests/nlu/base/test_evaluation.py            |  4 +---
 tests/nlu/extractors/test_entity_synonyms.py |  3 ++-
 5 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 1fefbd6e29c3..7fc82630f6f8 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -358,7 +358,7 @@ def _extract_features(
 
         if (
             self.component_config[NUM_TRANSFORMER_LAYERS] == 0
-            and self.component_config[ENTITY_RECOGNITION] == False
+            and not self.component_config[ENTITY_RECOGNITION]
             and attribute != INTENT
         ):
             # Use only the CLS token vector as features
diff --git a/tests/core/test_restore.py b/tests/core/test_restore.py
index 5a1554e6f9ad..9d792da4b15e 100644
--- a/tests/core/test_restore.py
+++ b/tests/core/test_restore.py
@@ -17,9 +17,11 @@ async def test_restoring_tracker(trained_moodbot_path, recwarn):
 
     await restore.replay_events(tracker, agent)
 
-    # makes sure there are no warnings. warnings are raised, if the models
+    # makes sure there are no warnings.warnings are raised, if the models
     # predictions differ from the tracker when the dumped tracker is replayed
-    assert [e for e in recwarn if e._category_name == "UserWarning"] == []
+    # TODO tensorflow is printing a warning currently, should be resolved with an
+    #   upcoming version (https://github.com/tensorflow/tensorflow/issues/35100)
+    # assert [e for e in recwarn if e._category_name == "UserWarning"] == []
 
     assert len(tracker.events) == 7
     assert tracker.latest_action_name == "action_listen"
diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py
index 2daa0e3c9c17..4d70797f9b52 100644
--- a/tests/nlu/base/test_config.py
+++ b/tests/nlu/base/test_config.py
@@ -63,10 +63,16 @@ def test_default_config_file():
 
 def test_set_attr_on_component():
     cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")
-    cfg.set_component_attr(7, C=324)
 
-    assert cfg.for_component(1) == {"name": "SpacyTokenizer"}
-    assert cfg.for_component(7) == {"name": "SklearnIntentClassifier", "C": 324}
+    idx_classifier = cfg.component_names.index("SklearnIntentClassifier")
+    idx_tokenizer = cfg.component_names.index("SpacyTokenizer")
+    cfg.set_component_attr(idx_classifier, C=324)
+
+    assert cfg.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"}
+    assert cfg.for_component(idx_classifier) == {
+        "name": "SklearnIntentClassifier",
+        "C": 324,
+    }
 
 
 def test_override_defaults_supervised_embeddings_pipeline():
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py
index 1c63a3c2f663..3678cff6275c 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/base/test_evaluation.py
@@ -2,7 +2,6 @@
 from typing import Text, Iterator
 
 import asyncio
-import logging
 
 import pytest
 from _pytest.tmpdir import TempdirFactory
@@ -43,7 +42,6 @@
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu import utils
 import json
 import os
 from rasa.nlu import training_data, config
@@ -51,12 +49,12 @@
 from tests.nlu.conftest import DEFAULT_DATA_PATH, NLU_DEFAULT_CONFIG_PATH
 from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.nlu.test import is_response_selector_present
+from rasa.utils.tensorflow.constants import EPOCHS
 
 
 # https://github.com/pytest-dev/pytest-asyncio/issues/68
 # this event_loop is used by pytest-asyncio, and redefining it
 # is currently the only way of changing the scope of this fixture
-from utils.tensorflow.constants import EPOCHS
 
 
 @pytest.yield_fixture(scope="session")
diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py
index eb9e5a332e2e..d24f4404e354 100644
--- a/tests/nlu/extractors/test_entity_synonyms.py
+++ b/tests/nlu/extractors/test_entity_synonyms.py
@@ -4,7 +4,8 @@
 
 def test_unintentional_synonyms_capitalized(component_builder):
     _config = utilities.base_test_conf("pretrained_embeddings_spacy")
-    ner_syn = component_builder.create_component(_config.for_component(6), _config)
+    idx = _config.component_names.index("EntitySynonymMapper")
+    ner_syn = component_builder.create_component(_config.for_component(idx), _config)
     examples = [
         Message(
             "Any Mexican restaurant will do",

From 70260b9c25ac551ce6a9d147d2e6349a3d75d46c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 11:50:59 +0100
Subject: [PATCH 419/633] disable cache

---
 .github/workflows/continous-integration.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/continous-integration.yml b/.github/workflows/continous-integration.yml
index 0b294e34b631..4284ac2f7115 100644
--- a/.github/workflows/continous-integration.yml
+++ b/.github/workflows/continous-integration.yml
@@ -49,12 +49,12 @@ jobs:
       with:
         python-version: 3.7
 
-    - name: Load Pip Cached Artifacts ⬇
-      uses: actions/cache@v1
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-3.7-pip-${{ hashFiles('**/requirements-dev.txt') }}
-        restore-keys: ${{ runner.os }}-3.7-pip-
+#    - name: Load Pip Cached Artifacts ⬇
+#      uses: actions/cache@v1
+#      with:
+#        path: ~/.cache/pip
+#        key: ${{ runner.os }}-3.7-pip-${{ hashFiles('**/requirements-dev.txt') }}
+#        restore-keys: ${{ runner.os }}-3.7-pip-
 
     - name: Install Dependencies 📦
       run: |

From 5af493c72591c51ff986c823f95d628184f0dc89 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 12:38:52 +0100
Subject: [PATCH 420/633] undo disable cache

---
 .github/workflows/continous-integration.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/continous-integration.yml b/.github/workflows/continous-integration.yml
index 4284ac2f7115..0b294e34b631 100644
--- a/.github/workflows/continous-integration.yml
+++ b/.github/workflows/continous-integration.yml
@@ -49,12 +49,12 @@ jobs:
       with:
         python-version: 3.7
 
-#    - name: Load Pip Cached Artifacts ⬇
-#      uses: actions/cache@v1
-#      with:
-#        path: ~/.cache/pip
-#        key: ${{ runner.os }}-3.7-pip-${{ hashFiles('**/requirements-dev.txt') }}
-#        restore-keys: ${{ runner.os }}-3.7-pip-
+    - name: Load Pip Cached Artifacts ⬇
+      uses: actions/cache@v1
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-3.7-pip-${{ hashFiles('**/requirements-dev.txt') }}
+        restore-keys: ${{ runner.os }}-3.7-pip-
 
     - name: Install Dependencies 📦
       run: |

From fa7e8eb5e1042660feb7f667eaecf8f99457484d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 12:44:40 +0100
Subject: [PATCH 421/633] Add comment.

---
 rasa/nlu/classifiers/diet_classifier.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 7fc82630f6f8..72d554aa7965 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -356,12 +356,15 @@ def _extract_features(
                     f"don't coincide in '{message.text}' for attribute '{attribute}'."
                 )
 
+        # To speed up training take only the CLS token vector as feature if we don't
+        # use the transformer and we don't want to do entity recognition. We would
+        # not make use of the sequence anyway in this setup.  Carrying over
+        # those features to the actual training process takes quite some time.
         if (
             self.component_config[NUM_TRANSFORMER_LAYERS] == 0
             and not self.component_config[ENTITY_RECOGNITION]
             and attribute != INTENT
         ):
-            # Use only the CLS token vector as features
             sparse_features = train_utils.sequence_to_sentence_features(sparse_features)
             dense_features = train_utils.sequence_to_sentence_features(dense_features)
 

From f6e7b4bfc8ac63701d02f0d0164100a32b9f0b2b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 18 Feb 2020 14:06:26 +0100
Subject: [PATCH 422/633] fix docstrings

---
 rasa/nlu/components.py | 84 ++++++++++++++++++++++++++----------------
 rasa/nlu/model.py      |  9 ++---
 2 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 9e6ea0256bd5..2bfef406b050 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -14,8 +14,8 @@
 
 
 def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
-    """Tries to import all the package names and returns
-    the packages where it failed."""
+    """Tries to import all package names and returns the packages where it failed."""
+
     import importlib
 
     failed_imports = set()
@@ -28,8 +28,8 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
 
 
 def validate_requirements(component_names: List[Text]) -> None:
-    """Ensures that all required importable python packages are installed to
-    instantiate and used the passed components."""
+    """Ensures that all required importable python packages are installed."""
+
     from rasa.nlu import registry
 
     # Validate that all required packages are installed
@@ -56,8 +56,10 @@ def validate_arguments(
     context: Dict[Text, Any],
     allow_empty_pipeline: bool = False,
 ) -> None:
-    """Validates a pipeline before it is run. Ensures, that all
-    arguments are present to train the pipeline."""
+    """Validates a pipeline before it is run.
+
+    Ensures, that all arguments are present to train the pipeline.
+    """
 
     # Ensure the pipeline is not empty
     if not allow_empty_pipeline and len(pipeline) == 0:
@@ -86,10 +88,11 @@ def validate_arguments(
 
 
 def any_of(*args) -> Tuple[Any]:
-    """Helper function to define that one of the given arguments is required
-    by a component.
+    """Helper function to define that one of the given arguments is required.
+
+    Should be used inside `requires`.
+    """
 
-    Should be used inside `requires`."""
     return args
 
 
@@ -98,8 +101,7 @@ def validate_requires_any_of(
     provided_properties: Set[Text],
     component_name: Text,
 ) -> None:
-    """Validates that at least one of the given required properties is present in
-    the provided properties."""
+    """Validates that at least one of the given required properties is present."""
 
     property_present = any(
         [property in provided_properties for property in required_properties]
@@ -131,8 +133,7 @@ def validate_required_components_from_data(
 
 
 class MissingArgumentError(ValueError):
-    """Raised when a function is called and not all parameters can be
-    filled from the context / config.
+    """Raised when not all parameters can be filled from the context / config.
 
     Attributes:
         message -- explanation of which parameter is missing
@@ -167,7 +168,7 @@ def __str__(self) -> Text:
 
 
 class ComponentMetaclass(type):
-    """Metaclass with `name` class property"""
+    """Metaclass with `name` class property."""
 
     @property
     def name(cls):
@@ -195,7 +196,8 @@ class Component(metaclass=ComponentMetaclass):
     components a component can use to do its own
     processing. For example, a featurizer component can provide
     features that are used by another component down
-    the pipeline to do intent classification."""
+    the pipeline to do intent classification.
+    """
 
     # Component class name is used when integrating it in a
     # pipeline. E.g. ``[ComponentA, ComponentB]``
@@ -251,13 +253,16 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
 
     @classmethod
     def required_packages(cls) -> List[Text]:
-        """Specify which python packages need to be installed to use this
-        component, e.g. ``["spacy"]``. More specifically, these should be
+        """Specify which python packages need to be installed.
+
+        E.g. ``["spacy"]``. More specifically, these should be
         importable python package names e.g. `sklearn` and not package
         names in the dependencies sense e.g. `scikit-learn`
 
         This list of requirements allows us to fail early during training
-        if a required package is not installed."""
+        if a required package is not installed.
+        """
+
         return []
 
     @classmethod
@@ -276,8 +281,9 @@ def load(
         this component needs to be able to restore itself.
         Components can rely on any context attributes that are
         created by :meth:`components.Component.create`
-        calls to components previous
-        to this one."""
+        calls to components previous to this one.
+        """
+
         if cached_component:
             return cached_component
         else:
@@ -300,7 +306,7 @@ def create(
         return cls(component_config)
 
     def provide_context(self) -> Optional[Dict[Text, Any]]:
-        """Initialize this component for a new pipeline
+        """Initialize this component for a new pipeline.
 
         This function will be called before the training
         is started and before the first message is processed using
@@ -310,7 +316,9 @@ def provide_context(self) -> Optional[Dict[Text, Any]]:
         components do not need to implement this method.
         It's mostly used to initialize framework environments
         like MITIE and spacy
-        (e.g. loading word vectors for the pipeline)."""
+        (e.g. loading word vectors for the pipeline).
+        """
+
         pass
 
     def train(
@@ -328,7 +336,9 @@ def train(
         of ANY component and
         on any context attributes created by a call to
         :meth:`rasa.nlu.components.Component.train`
-        of components previous to this one."""
+        of components previous to this one.
+        """
+
         pass
 
     def process(self, message: Message, **kwargs: Any) -> None:
@@ -341,7 +351,9 @@ def process(self, message: Message, **kwargs: Any) -> None:
         of ANY component and
         on any context attributes created by a call to
         :meth:`rasa.nlu.components.Component.process`
-        of components previous to this one."""
+        of components previous to this one.
+        """
+
         pass
 
     def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
@@ -358,7 +370,8 @@ def cache_key(
         If a component is unique to a model it should return None.
         Otherwise, an instantiation of the
         component will be reused for all models where the
-        metadata creates the same key."""
+        metadata creates the same key.
+        """
 
         return None
 
@@ -382,7 +395,8 @@ def prepare_partial_processing(
         The pipeline should be a list of components that are
         previous to this one in the pipeline and
         have already finished their training (and can therefore
-        be safely used to process messages)."""
+        be safely used to process messages).
+        """
 
         self.partial_processing_pipeline = pipeline
         self.partial_processing_context = context
@@ -392,7 +406,8 @@ def partially_process(self, message: Message) -> Message:
         training (e.g. external training data).
 
         The passed message will be processed by all components
-        previous to this one in the pipeline."""
+        previous to this one in the pipeline.
+        """
 
         if self.partial_processing_context is not None:
             for component in self.partial_processing_pipeline:
@@ -406,7 +421,8 @@ def can_handle_language(cls, language: Hashable) -> bool:
         """Check if component supports a specific language.
 
         This method can be overwritten when needed. (e.g. dynamically
-        determine which language is supported.)"""
+        determine which language is supported.)
+        """
 
         # if language_list is set to `None` it means: support all languages
         if language is None or cls.language_list is None:
@@ -466,7 +482,9 @@ def load_component(
         model_metadata: "Metadata",
         **context: Any,
     ) -> Component:
-        """Tries to retrieve a component from the cache, else calls
+        """Loads a component.
+
+        Tries to retrieve a component from the cache, else calls
         ``load`` to create a new component.
 
         Args:
@@ -504,8 +522,12 @@ def load_component(
     def create_component(
         self, component_config: Dict[Text, Any], cfg: RasaNLUModelConfig
     ) -> Component:
-        """Tries to retrieve a component from the cache,
-        calls `create` to create a new component."""
+        """Creates a component.
+
+        Tries to retrieve a component from the cache,
+        calls `create` to create a new component.
+        """
+
         from rasa.nlu import registry
         from rasa.nlu.model import Metadata
 
diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py
index 6f96ce7f76bf..f353279cfa37 100644
--- a/rasa/nlu/model.py
+++ b/rasa/nlu/model.py
@@ -117,10 +117,8 @@ class Trainer:
     """Trainer will load the data and train all components.
 
     Requires a pipeline specification and configuration to use for
-    the training."""
-
-    # Officially supported languages (others might be used, but might fail)
-    SUPPORTED_LANGUAGES = ["de", "en"]
+    the training.
+    """
 
     def __init__(
         self,
@@ -150,7 +148,8 @@ def __init__(
     def _build_pipeline(
         cfg: RasaNLUModelConfig, component_builder: ComponentBuilder
     ) -> List[Component]:
-        """Transform the passed names of the pipeline components into classes"""
+        """Transform the passed names of the pipeline components into classes."""
+
         pipeline = []
 
         # Transform the passed names of the pipeline components into classes

From 27280b2afbf97946f4e6a359c29192ae88fff3f0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 14:33:19 +0100
Subject: [PATCH 423/633] update nlu tests

---
 tests/nlu/base/test_emulators.py              | 152 ------------------
 tests/nlu/conftest.py                         | 107 ++++++++----
 tests/nlu/{base => emulators}/__init__.py     |   0
 tests/nlu/emulators/test_dialogflow.py        |  40 +++++
 tests/nlu/emulators/test_luis.py              |  44 +++++
 tests/nlu/emulators/test_no_emulator.py       |  28 ++++
 tests/nlu/emulators/test_wit.py               |  34 ++++
 .../extractors/test_crf_entity_extractor.py   |  14 +-
 tests/nlu/extractors/test_entity_synonyms.py  |  21 ++-
 .../nlu/{base => extractors}/test_synonyms.py |   0
 tests/nlu/featurizers/test_lm_featurizer.py   |   9 +-
 tests/nlu/selectors/test_response_selector.py |  64 --------
 tests/nlu/selectors/test_selectors.py         |  41 +++++
 tests/nlu/{base => }/test_components.py       |   0
 tests/nlu/{base => }/test_config.py           |  58 ++++---
 tests/nlu/{base => }/test_evaluation.py       |  36 ++---
 tests/nlu/{base => }/test_interpreter.py      |   0
 tests/nlu/{base => }/test_persistor.py        |   0
 tests/nlu/{training => }/test_train.py        |  30 ++--
 tests/nlu/{base => }/test_utils.py            |   0
 .../tokenizers/test_whitespace_tokenizer.py   |   6 +-
 .../{training => training_data}/__init__.py   |   0
 .../test_training_data.py                     |   1 -
 tests/nlu/utilities.py                        |  10 --
 24 files changed, 355 insertions(+), 340 deletions(-)
 delete mode 100644 tests/nlu/base/test_emulators.py
 rename tests/nlu/{base => emulators}/__init__.py (100%)
 create mode 100644 tests/nlu/emulators/test_dialogflow.py
 create mode 100644 tests/nlu/emulators/test_luis.py
 create mode 100644 tests/nlu/emulators/test_no_emulator.py
 create mode 100644 tests/nlu/emulators/test_wit.py
 rename tests/nlu/{base => extractors}/test_synonyms.py (100%)
 delete mode 100644 tests/nlu/selectors/test_response_selector.py
 create mode 100644 tests/nlu/selectors/test_selectors.py
 rename tests/nlu/{base => }/test_components.py (100%)
 rename tests/nlu/{base => }/test_config.py (52%)
 rename tests/nlu/{base => }/test_evaluation.py (96%)
 rename tests/nlu/{base => }/test_interpreter.py (100%)
 rename tests/nlu/{base => }/test_persistor.py (100%)
 rename tests/nlu/{training => }/test_train.py (91%)
 rename tests/nlu/{base => }/test_utils.py (100%)
 rename tests/nlu/{training => training_data}/__init__.py (100%)
 rename tests/nlu/{base => training_data}/test_training_data.py (99%)

diff --git a/tests/nlu/base/test_emulators.py b/tests/nlu/base/test_emulators.py
deleted file mode 100644
index 2d21e966b909..000000000000
--- a/tests/nlu/base/test_emulators.py
+++ /dev/null
@@ -1,152 +0,0 @@
-def test_luis_request():
-    from rasa.nlu.emulators.luis import LUISEmulator
-
-    em = LUISEmulator()
-    norm = em.normalise_request_json({"text": ["arb text"]})
-    assert norm == {"text": "arb text", "time": None}
-
-
-def test_luis_response():
-    from rasa.nlu.emulators.luis import LUISEmulator
-
-    em = LUISEmulator()
-    data = {
-        "text": "I want italian food",
-        "intent": {"name": "restaurant_search", "confidence": 0.737014589341683},
-        "intent_ranking": [
-            {"confidence": 0.737014589341683, "name": "restaurant_search"},
-            {"confidence": 0.11605464483122209, "name": "goodbye"},
-            {"confidence": 0.08816417744097163, "name": "greet"},
-            {"confidence": 0.058766588386123204, "name": "affirm"},
-        ],
-        "entities": [{"entity": "cuisine", "value": "italian"}],
-    }
-    norm = em.normalise_response_json(data)
-    assert norm == {
-        "query": data["text"],
-        "topScoringIntent": {"intent": "restaurant_search", "score": 0.737014589341683},
-        "intents": [
-            {"intent": "restaurant_search", "score": 0.737014589341683},
-            {"intent": "goodbye", "score": 0.11605464483122209},
-            {"intent": "greet", "score": 0.08816417744097163},
-            {"intent": "affirm", "score": 0.058766588386123204},
-        ],
-        "entities": [
-            {
-                "entity": e["value"],
-                "type": e["entity"],
-                "startIndex": None,
-                "endIndex": None,
-                "score": None,
-            }
-            for e in data["entities"]
-        ],
-    }
-
-
-def test_wit_request():
-    from rasa.nlu.emulators.wit import WitEmulator
-
-    em = WitEmulator()
-    norm = em.normalise_request_json({"text": ["arb text"]})
-    assert norm == {"text": "arb text", "time": None}
-
-
-def test_wit_response():
-    from rasa.nlu.emulators.wit import WitEmulator
-
-    em = WitEmulator()
-    data = {
-        "text": "I want italian food",
-        "intent": {"name": "inform", "confidence": 0.4794813722432127},
-        "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}],
-    }
-    norm = em.normalise_response_json(data)
-    assert norm == [
-        {
-            "entities": {
-                "cuisine": {
-                    "confidence": None,
-                    "type": "value",
-                    "value": "italian",
-                    "start": 7,
-                    "end": 14,
-                }
-            },
-            "intent": "inform",
-            "_text": "I want italian food",
-            "confidence": 0.4794813722432127,
-        }
-    ]
-
-
-def test_dialogflow_request():
-    from rasa.nlu.emulators.dialogflow import DialogflowEmulator
-
-    em = DialogflowEmulator()
-    norm = em.normalise_request_json({"text": ["arb text"]})
-    assert norm == {"text": "arb text", "time": None}
-
-
-def test_dialogflow_response():
-    from rasa.nlu.emulators.dialogflow import DialogflowEmulator
-
-    em = DialogflowEmulator()
-    data = {
-        "text": "I want italian food",
-        "intent": {"name": "inform", "confidence": 0.4794813722432127},
-        "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}],
-    }
-    norm = em.normalise_response_json(data)
-
-    assert norm == {
-        "id": norm["id"],
-        "result": {
-            "action": data["intent"]["name"],
-            "actionIncomplete": False,
-            "contexts": [],
-            "fulfillment": {},
-            "metadata": {
-                "intentId": norm["result"]["metadata"]["intentId"],
-                "intentName": data["intent"]["name"],
-                "webhookUsed": "false",
-            },
-            "parameters": {"cuisine": ["italian"]},
-            "resolvedQuery": data["text"],
-            "score": data["intent"]["confidence"],
-            "source": "agent",
-        },
-        "sessionId": norm["sessionId"],
-        "status": {"code": 200, "errorType": "success"},
-        "timestamp": norm["timestamp"],
-    }
-
-
-def test_dummy_request():
-    from rasa.nlu.emulators.no_emulator import NoEmulator
-
-    em = NoEmulator()
-    norm = em.normalise_request_json({"text": ["arb text"]})
-    assert norm == {"text": "arb text", "time": None}
-
-    norm = em.normalise_request_json({"text": ["arb text"], "time": "1499279161658"})
-    assert norm == {"text": "arb text", "time": "1499279161658"}
-
-
-def test_dummy_response():
-    from rasa.nlu.emulators.no_emulator import NoEmulator
-
-    em = NoEmulator()
-    data = {"intent": "greet", "text": "hi", "entities": {}, "confidence": 1.0}
-    assert em.normalise_response_json(data) == data
-
-
-def test_emulators_can_handle_missing_data():
-    from rasa.nlu.emulators.luis import LUISEmulator
-
-    em = LUISEmulator()
-    norm = em.normalise_response_json(
-        {"text": "this data doesn't contain an intent result"}
-    )
-    assert norm["topScoringIntent"] is None
-    assert norm["intents"] == []
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 27ab3a5ccc13..d0de170f26f4 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -1,22 +1,14 @@
-import logging
-import os
+from typing import Text
 
 import pytest
 
-from rasa.nlu import config, train
+from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.components import ComponentBuilder
-from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED
-
-CONFIG_DEFAULTS_PATH = "sample_configs/config_defaults.yml"
-
-NLU_DEFAULT_CONFIG_PATH = "sample_configs/config_pretrained_embeddings_mitie.yml"
+from rasa.utils.tensorflow.constants import EPOCHS
+from tests.nlu.utilities import write_file_config
 
 DEFAULT_DATA_PATH = "data/examples/rasa/demo-rasa.json"
 
-NLU_MODEL_NAME = "nlu_model.tar.gz"
-
-MOODBOT_MODEL_PATH = "examples/moodbot/models/"
-
 
 @pytest.fixture(scope="session")
 def component_builder():
@@ -36,20 +28,9 @@ def spacy_nlp_component(component_builder, default_config):
 
 
 @pytest.fixture(scope="session")
-def ner_crf_pos_feature_config():
-    return {
-        "features": [
-            ["low", "title", "upper", "pos", "pos2"],
-            ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
-            ["low", "title", "upper", "pos", "pos2"],
-        ],
-        EPOCHS: 100,
-        RANDOM_SEED: 2020,
-    }
-
-
-@pytest.fixture(scope="session")
-def mitie_feature_extractor(component_builder, default_config):
+def mitie_feature_extractor(
+    component_builder: ComponentBuilder, default_config: RasaNLUModelConfig
+):
     mitie_nlp_config = {"name": "MitieNLP"}
     return component_builder.create_component(
         mitie_nlp_config, default_config
@@ -57,5 +38,75 @@ def mitie_feature_extractor(component_builder, default_config):
 
 
 @pytest.fixture(scope="session")
-def default_config():
-    return config.load(CONFIG_DEFAULTS_PATH)
+def default_config() -> RasaNLUModelConfig:
+    return RasaNLUModelConfig({"language": "en", "pipeline": []})
+
+
+@pytest.fixture(scope="session")
+def config_path() -> Text:
+    return write_file_config(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CRFEntityExtractor", EPOCHS: 2},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 2},
+            ],
+        }
+    ).name
+
+
+@pytest.fixture(scope="session")
+def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig:
+    return RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "SpacyNLP"},
+                {"name": "SpacyTokenizer"},
+                {"name": "SpacyFeaturizer"},
+                {"name": "RegexFeaturizer"},
+                {"name": "CRFEntityExtractor", EPOCHS: 3},
+                {"name": "EntitySynonymMapper"},
+                {"name": "SklearnIntentClassifier"},
+            ],
+        }
+    )
+
+
+@pytest.fixture(scope="session")
+def supervised_embeddings_config() -> RasaNLUModelConfig:
+    return RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "RegexFeaturizer"},
+                {"name": "CRFEntityExtractor", EPOCHS: 3},
+                {"name": "EntitySynonymMapper"},
+                {"name": "CountVectorsFeaturizer"},
+                {
+                    "name": "CountVectorsFeaturizer",
+                    "analyzer": "char_wb",
+                    "min_ngram": 1,
+                    "max_ngram": 4,
+                },
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 3},
+            ],
+        }
+    )
+
+
+@pytest.fixture(scope="session")
+def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
+    return RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "ConveRTTokenizer"},
+                {"name": "ConveRTFeaturizer"},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 3},
+            ],
+        }
+    )
diff --git a/tests/nlu/base/__init__.py b/tests/nlu/emulators/__init__.py
similarity index 100%
rename from tests/nlu/base/__init__.py
rename to tests/nlu/emulators/__init__.py
diff --git a/tests/nlu/emulators/test_dialogflow.py b/tests/nlu/emulators/test_dialogflow.py
new file mode 100644
index 000000000000..76abc0ee0080
--- /dev/null
+++ b/tests/nlu/emulators/test_dialogflow.py
@@ -0,0 +1,40 @@
+def test_dialogflow_request():
+    from rasa.nlu.emulators.dialogflow import DialogflowEmulator
+
+    em = DialogflowEmulator()
+    norm = em.normalise_request_json({"text": ["arb text"]})
+    assert norm == {"text": "arb text", "time": None}
+
+
+def test_dialogflow_response():
+    from rasa.nlu.emulators.dialogflow import DialogflowEmulator
+
+    em = DialogflowEmulator()
+    data = {
+        "text": "I want italian food",
+        "intent": {"name": "inform", "confidence": 0.4794813722432127},
+        "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}],
+    }
+    norm = em.normalise_response_json(data)
+
+    assert norm == {
+        "id": norm["id"],
+        "result": {
+            "action": data["intent"]["name"],
+            "actionIncomplete": False,
+            "contexts": [],
+            "fulfillment": {},
+            "metadata": {
+                "intentId": norm["result"]["metadata"]["intentId"],
+                "intentName": data["intent"]["name"],
+                "webhookUsed": "false",
+            },
+            "parameters": {"cuisine": ["italian"]},
+            "resolvedQuery": data["text"],
+            "score": data["intent"]["confidence"],
+            "source": "agent",
+        },
+        "sessionId": norm["sessionId"],
+        "status": {"code": 200, "errorType": "success"},
+        "timestamp": norm["timestamp"],
+    }
diff --git a/tests/nlu/emulators/test_luis.py b/tests/nlu/emulators/test_luis.py
new file mode 100644
index 000000000000..5c2cad97e1ba
--- /dev/null
+++ b/tests/nlu/emulators/test_luis.py
@@ -0,0 +1,44 @@
+def test_luis_request():
+    from rasa.nlu.emulators.luis import LUISEmulator
+
+    em = LUISEmulator()
+    norm = em.normalise_request_json({"text": ["arb text"]})
+    assert norm == {"text": "arb text", "time": None}
+
+
+def test_luis_response():
+    from rasa.nlu.emulators.luis import LUISEmulator
+
+    em = LUISEmulator()
+    data = {
+        "text": "I want italian food",
+        "intent": {"name": "restaurant_search", "confidence": 0.737014589341683},
+        "intent_ranking": [
+            {"confidence": 0.737014589341683, "name": "restaurant_search"},
+            {"confidence": 0.11605464483122209, "name": "goodbye"},
+            {"confidence": 0.08816417744097163, "name": "greet"},
+            {"confidence": 0.058766588386123204, "name": "affirm"},
+        ],
+        "entities": [{"entity": "cuisine", "value": "italian"}],
+    }
+    norm = em.normalise_response_json(data)
+    assert norm == {
+        "query": data["text"],
+        "topScoringIntent": {"intent": "restaurant_search", "score": 0.737014589341683},
+        "intents": [
+            {"intent": "restaurant_search", "score": 0.737014589341683},
+            {"intent": "goodbye", "score": 0.11605464483122209},
+            {"intent": "greet", "score": 0.08816417744097163},
+            {"intent": "affirm", "score": 0.058766588386123204},
+        ],
+        "entities": [
+            {
+                "entity": e["value"],
+                "type": e["entity"],
+                "startIndex": None,
+                "endIndex": None,
+                "score": None,
+            }
+            for e in data["entities"]
+        ],
+    }
diff --git a/tests/nlu/emulators/test_no_emulator.py b/tests/nlu/emulators/test_no_emulator.py
new file mode 100644
index 000000000000..cc40b3ae8390
--- /dev/null
+++ b/tests/nlu/emulators/test_no_emulator.py
@@ -0,0 +1,28 @@
+def test_dummy_request():
+    from rasa.nlu.emulators.no_emulator import NoEmulator
+
+    em = NoEmulator()
+    norm = em.normalise_request_json({"text": ["arb text"]})
+    assert norm == {"text": "arb text", "time": None}
+
+    norm = em.normalise_request_json({"text": ["arb text"], "time": "1499279161658"})
+    assert norm == {"text": "arb text", "time": "1499279161658"}
+
+
+def test_dummy_response():
+    from rasa.nlu.emulators.no_emulator import NoEmulator
+
+    em = NoEmulator()
+    data = {"intent": "greet", "text": "hi", "entities": {}, "confidence": 1.0}
+    assert em.normalise_response_json(data) == data
+
+
+def test_emulators_can_handle_missing_data():
+    from rasa.nlu.emulators.luis import LUISEmulator
+
+    em = LUISEmulator()
+    norm = em.normalise_response_json(
+        {"text": "this data doesn't contain an intent result"}
+    )
+    assert norm["topScoringIntent"] is None
+    assert norm["intents"] == []
diff --git a/tests/nlu/emulators/test_wit.py b/tests/nlu/emulators/test_wit.py
new file mode 100644
index 000000000000..069caa26f27f
--- /dev/null
+++ b/tests/nlu/emulators/test_wit.py
@@ -0,0 +1,34 @@
+def test_wit_request():
+    from rasa.nlu.emulators.wit import WitEmulator
+
+    em = WitEmulator()
+    norm = em.normalise_request_json({"text": ["arb text"]})
+    assert norm == {"text": "arb text", "time": None}
+
+
+def test_wit_response():
+    from rasa.nlu.emulators.wit import WitEmulator
+
+    em = WitEmulator()
+    data = {
+        "text": "I want italian food",
+        "intent": {"name": "inform", "confidence": 0.4794813722432127},
+        "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}],
+    }
+    norm = em.normalise_response_json(data)
+    assert norm == [
+        {
+            "entities": {
+                "cuisine": {
+                    "confidence": None,
+                    "type": "value",
+                    "value": "italian",
+                    "start": 7,
+                    "end": 14,
+                }
+            },
+            "intent": "inform",
+            "_text": "I want italian food",
+            "confidence": 0.4794813722432127,
+        }
+    ]
diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index f90ac8cdf1d7..89624008f4d9 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -2,9 +2,10 @@
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
+from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED
 
 
-def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
+def test_crf_extractor(spacy_nlp):
     examples = [
         Message(
             "anywhere in the west",
@@ -41,7 +42,16 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
         ),
     ]
 
-    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
+    extractor = CRFEntityExtractor(
+        component_config={
+            "features": [
+                ["low", "title", "upper", "pos", "pos2"],
+                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
+                ["low", "title", "upper", "pos", "pos2"],
+            ],
+            RANDOM_SEED: 1,
+        }
+    )
     tokenizer = WhitespaceTokenizer()
 
     training_data = TrainingData(training_examples=examples)
diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py
index d24f4404e354..38df9fc9b81f 100644
--- a/tests/nlu/extractors/test_entity_synonyms.py
+++ b/tests/nlu/extractors/test_entity_synonyms.py
@@ -2,10 +2,17 @@
 from tests.nlu import utilities
 
 
-def test_unintentional_synonyms_capitalized(component_builder):
-    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
-    idx = _config.component_names.index("EntitySynonymMapper")
-    ner_syn = component_builder.create_component(_config.for_component(idx), _config)
+def test_unintentional_synonyms_capitalized(
+    component_builder, pretrained_embeddings_spacy_config
+):
+    idx = pretrained_embeddings_spacy_config.component_names.index(
+        "EntitySynonymMapper"
+    )
+    ner_syn = component_builder.create_component(
+        pretrained_embeddings_spacy_config.for_component(idx),
+        pretrained_embeddings_spacy_config,
+    )
+
     examples = [
         Message(
             "Any Mexican restaurant will do",
@@ -26,6 +33,10 @@ def test_unintentional_synonyms_capitalized(component_builder):
             },
         ),
     ]
-    ner_syn.train(TrainingData(training_examples=examples), _config)
+
+    ner_syn.train(
+        TrainingData(training_examples=examples), pretrained_embeddings_spacy_config
+    )
+
     assert ner_syn.synonyms.get("mexican") is None
     assert ner_syn.synonyms.get("tacos") == "Mexican"
diff --git a/tests/nlu/base/test_synonyms.py b/tests/nlu/extractors/test_synonyms.py
similarity index 100%
rename from tests/nlu/base/test_synonyms.py
rename to tests/nlu/extractors/test_synonyms.py
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
index 119da8622d9c..f490781ad714 100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -4,14 +4,7 @@
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
-from rasa.nlu.constants import (
-    TEXT,
-    DENSE_FEATURE_NAMES,
-    TOKENS_NAMES,
-    RESPONSE,
-    INTENT,
-    LANGUAGE_MODEL_DOCS,
-)
+from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, INTENT
 from rasa.nlu.training_data import Message
 
 
diff --git a/tests/nlu/selectors/test_response_selector.py b/tests/nlu/selectors/test_response_selector.py
deleted file mode 100644
index 831d412b0d0f..000000000000
--- a/tests/nlu/selectors/test_response_selector.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.training_data import load_data
-from rasa.nlu.train import Trainer, Interpreter
-from rasa.utils.tensorflow.constants import EPOCHS
-
-
-def test_train_response_selector(component_builder, tmpdir):
-    td = load_data("data/examples/rasa/demo-rasa.md")
-    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
-    td = td.merge(td_responses)
-    td.fill_response_phrases()
-
-    nlu_config = RasaNLUModelConfig(
-        {
-            "language": "en",
-            "pipeline": [
-                {"name": "WhitespaceTokenizer"},
-                {"name": "CountVectorsFeaturizer"},
-                {"name": "DIETClassifier", EPOCHS: 2},
-                {"name": "ResponseSelector", EPOCHS: 2},
-            ],
-        }
-    )
-
-    trainer = Trainer(nlu_config)
-    trainer.train(td)
-
-    persisted_path = trainer.persist(tmpdir)
-
-    assert trainer.pipeline
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
-
-
-def test_train_diet_selector(component_builder, tmpdir):
-    td = load_data("data/examples/rasa/demo-rasa.md")
-    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
-    td = td.merge(td_responses)
-    td.fill_response_phrases()
-
-    nlu_config = RasaNLUModelConfig(
-        {
-            "language": "en",
-            "pipeline": [
-                {"name": "WhitespaceTokenizer"},
-                {"name": "CountVectorsFeaturizer"},
-                {"name": "DIETClassifier", EPOCHS: 2},
-                {"name": "DIETSelector", EPOCHS: 2},
-            ],
-        }
-    )
-
-    trainer = Trainer(nlu_config)
-    trainer.train(td)
-
-    persisted_path = trainer.persist(tmpdir)
-
-    assert trainer.pipeline
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
new file mode 100644
index 000000000000..02fd54764ba3
--- /dev/null
+++ b/tests/nlu/selectors/test_selectors.py
@@ -0,0 +1,41 @@
+import pytest
+
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import load_data
+from rasa.nlu.train import Trainer, Interpreter
+from rasa.utils.tensorflow.constants import EPOCHS
+
+
+@pytest.mark.parametrize(
+    "pipeline",
+    [
+        [
+            {"name": "WhitespaceTokenizer"},
+            {"name": "CountVectorsFeaturizer"},
+            {"name": "ResponseSelector", EPOCHS: 2},
+        ],
+        [
+            {"name": "WhitespaceTokenizer"},
+            {"name": "CountVectorsFeaturizer"},
+            {"name": "DIETSelector", EPOCHS: 2},
+        ],
+    ],
+)
+def test_train_selector(pipeline, component_builder, tmpdir):
+    td = load_data("data/examples/rasa/demo-rasa.md")
+    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
+    td = td.merge(td_responses)
+    td.fill_response_phrases()
+
+    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})
+
+    trainer = Trainer(nlu_config)
+    trainer.train(td)
+
+    persisted_path = trainer.persist(tmpdir)
+
+    assert trainer.pipeline
+    loaded = Interpreter.load(persisted_path, component_builder)
+    assert loaded.pipeline
+    assert loaded.parse("hello") is not None
+    assert loaded.parse("Hello today is Monday, again!") is not None
diff --git a/tests/nlu/base/test_components.py b/tests/nlu/test_components.py
similarity index 100%
rename from tests/nlu/base/test_components.py
rename to tests/nlu/test_components.py
diff --git a/tests/nlu/base/test_config.py b/tests/nlu/test_config.py
similarity index 52%
rename from tests/nlu/base/test_config.py
rename to tests/nlu/test_config.py
index 4d70797f9b52..9fabad40b700 100644
--- a/tests/nlu/base/test_config.py
+++ b/tests/nlu/test_config.py
@@ -4,25 +4,17 @@
 
 import pytest
 
-import rasa.utils.io
 from rasa.nlu import config
 from rasa.nlu.components import ComponentBuilder
 from rasa.nlu.registry import registered_pipeline_templates
-from tests.nlu.conftest import CONFIG_DEFAULTS_PATH
 from tests.nlu.utilities import write_file_config
 
-defaults = rasa.utils.io.read_config_file(CONFIG_DEFAULTS_PATH)
 
-
-def test_default_config(default_config):
-    assert default_config.as_dict() == defaults
-
-
-def test_blank_config():
+def test_blank_config(default_config):
     file_config = {}
     f = write_file_config(file_config)
     final_config = config.load(f.name)
-    assert final_config.as_dict() == defaults
+    assert final_config.as_dict() == default_config.as_dict()
 
 
 def test_invalid_config_json():
@@ -61,29 +53,43 @@ def test_default_config_file():
     assert len(final_config) > 1
 
 
-def test_set_attr_on_component():
-    cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml")
-
-    idx_classifier = cfg.component_names.index("SklearnIntentClassifier")
-    idx_tokenizer = cfg.component_names.index("SpacyTokenizer")
-    cfg.set_component_attr(idx_classifier, C=324)
+def test_set_attr_on_component(pretrained_embeddings_spacy_config):
+    idx_classifier = pretrained_embeddings_spacy_config.component_names.index(
+        "SklearnIntentClassifier"
+    )
+    idx_tokenizer = pretrained_embeddings_spacy_config.component_names.index(
+        "SpacyTokenizer"
+    )
+    pretrained_embeddings_spacy_config.set_component_attr(idx_classifier, C=324)
 
-    assert cfg.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"}
-    assert cfg.for_component(idx_classifier) == {
+    assert pretrained_embeddings_spacy_config.for_component(idx_tokenizer) == {
+        "name": "SpacyTokenizer"
+    }
+    assert pretrained_embeddings_spacy_config.for_component(idx_classifier) == {
         "name": "SklearnIntentClassifier",
         "C": 324,
     }
 
 
-def test_override_defaults_supervised_embeddings_pipeline():
-    cfg = config.load("data/test/config_embedding_test.yml")
+def test_override_defaults_supervised_embeddings_pipeline(supervised_embeddings_config):
     builder = ComponentBuilder()
 
-    component1_cfg = cfg.for_component(0)
+    idx_featurizer = supervised_embeddings_config.component_names.index(
+        "CountVectorsFeaturizer"
+    )
+    idx_classifier = supervised_embeddings_config.component_names.index(
+        "EmbeddingIntentClassifier"
+    )
+
+    config_featurizer = supervised_embeddings_config.for_component(idx_featurizer)
+    config_classifier = supervised_embeddings_config.for_component(idx_classifier)
 
-    component1 = builder.create_component(component1_cfg, cfg)
-    assert component1.max_ngram == 3
+    component1 = builder.create_component(
+        config_featurizer, supervised_embeddings_config
+    )
+    assert component1.max_ngram == 1
 
-    component2_cfg = cfg.for_component(1)
-    component2 = builder.create_component(component2_cfg, cfg)
-    assert component2.component_config["epochs"] == 10
+    component2 = builder.create_component(
+        config_classifier, supervised_embeddings_config
+    )
+    assert component2.component_config["epochs"] == 3
diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/test_evaluation.py
similarity index 96%
rename from tests/nlu/base/test_evaluation.py
rename to tests/nlu/test_evaluation.py
index 3678cff6275c..d7251da69e6e 100644
--- a/tests/nlu/base/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -7,6 +7,7 @@
 from _pytest.tmpdir import TempdirFactory
 
 import rasa.utils.io
+from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 from rasa.test import compare_nlu_models
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
@@ -44,9 +45,9 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 import json
 import os
-from rasa.nlu import training_data, config
+from rasa.nlu import training_data
 from tests.nlu import utilities
-from tests.nlu.conftest import DEFAULT_DATA_PATH, NLU_DEFAULT_CONFIG_PATH
+from tests.nlu.conftest import DEFAULT_DATA_PATH
 from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.nlu.test import is_response_selector_present
 from rasa.utils.tensorflow.constants import EPOCHS
@@ -212,7 +213,7 @@ def test_determine_token_labels_throws_error():
         determine_token_labels(
             CH_correct_segmentation[0],
             [CH_correct_entity, CH_wrong_entity],
-            ["CRFEntityExtractor"],
+            set(CRFEntityExtractor.name),
         )
 
 
@@ -231,7 +232,7 @@ def test_determine_token_labels_with_extractors():
     determine_token_labels(
         CH_correct_segmentation[0],
         [CH_correct_entity, CH_wrong_entity],
-        [SpacyEntityExtractor.name, MitieEntityExtractor.name],
+        set(SpacyEntityExtractor.name, MitieEntityExtractor.name),
     )
 
 
@@ -279,26 +280,12 @@ def test_run_evaluation(unpacked_trained_moodbot_path):
     assert result.get("entity_evaluation").get("CRFEntityExtractor")
 
 
-def test_run_cv_evaluation():
+def test_run_cv_evaluation(pretrained_embeddings_spacy_config):
     td = training_data.load_data("data/examples/rasa/demo-rasa.json")
-    nlu_config = RasaNLUModelConfig(
-        {
-            "language": "en",
-            "pipeline": [
-                {"name": "SpacyNLP"},
-                {"name": "SpacyTokenizer"},
-                {"name": "SpacyFeaturizer"},
-                {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 3},
-                {"name": "EntitySynonymMapper"},
-                {"name": "SklearnIntentClassifier"},
-            ],
-        }
-    )
 
     n_folds = 2
     intent_results, entity_results, response_selection_results = cross_validate(
-        td, n_folds, nlu_config
+        td, n_folds, pretrained_embeddings_spacy_config
     )
 
     assert len(intent_results.train["Accuracy"]) == n_folds
@@ -315,7 +302,7 @@ def test_run_cv_evaluation():
     assert len(entity_results.test["CRFEntityExtractor"]["F1-score"]) == n_folds
 
 
-def test_run_cv_evaluation_with_response_selector():
+def test_run_cv_evaluation_with_response_selector(supervised_embeddings_config):
     training_data_obj = training_data.load_data("data/examples/rasa/demo-rasa.md")
     training_data_responses_obj = training_data.load_data(
         "data/examples/rasa/demo-rasa-responses.md"
@@ -739,11 +726,8 @@ def test_get_evaluation_metrics(
     assert NO_ENTITY not in report
 
 
-def test_nlu_comparison(tmpdir):
-    configs = [
-        NLU_DEFAULT_CONFIG_PATH,
-        "sample_configs/config_supervised_embeddings.yml",
-    ]
+def test_nlu_comparison(tmpdir, config_path):
+    configs = [config_path]
     output = tmpdir.strpath
 
     compare_nlu_models(
diff --git a/tests/nlu/base/test_interpreter.py b/tests/nlu/test_interpreter.py
similarity index 100%
rename from tests/nlu/base/test_interpreter.py
rename to tests/nlu/test_interpreter.py
diff --git a/tests/nlu/base/test_persistor.py b/tests/nlu/test_persistor.py
similarity index 100%
rename from tests/nlu/base/test_persistor.py
rename to tests/nlu/test_persistor.py
diff --git a/tests/nlu/training/test_train.py b/tests/nlu/test_train.py
similarity index 91%
rename from tests/nlu/training/test_train.py
rename to tests/nlu/test_train.py
index eb3e9443a96e..282a703755e2 100644
--- a/tests/nlu/training/test_train.py
+++ b/tests/nlu/test_train.py
@@ -122,28 +122,27 @@ async def test_train_model(pipeline_template, component_builder, tmpdir):
     assert loaded.parse("Hello today is Monday, again!") is not None
 
 
-async def test_random_seed(component_builder, tmpdir):
+async def test_random_seed(component_builder, tmpdir, supervised_embeddings_config):
     """test if train result is the same for two runs of tf embedding"""
 
-    _config = utilities.base_test_conf("supervised_embeddings")
     # set fixed random seed
-    idx = _config.component_names.index("EmbeddingIntentClassifier")
-    _config.set_component_attr(idx, random_seed=1)
-    _config.set_component_attr(idx, epochs=1)
-    idx = _config.component_names.index("CRFEntityExtractor")
-    _config.set_component_attr(idx, random_seed=1)
-    _config.set_component_attr(idx, epochs=1)
+    idx = supervised_embeddings_config.component_names.index(
+        "EmbeddingIntentClassifier"
+    )
+    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
+    idx = supervised_embeddings_config.component_names.index("CRFEntityExtractor")
+    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
 
     # first run
     (trained_a, _, persisted_path_a) = await train(
-        _config,
+        supervised_embeddings_config,
         path=tmpdir.strpath + "_a",
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
     # second run
     (trained_b, _, persisted_path_b) = await train(
-        _config,
+        supervised_embeddings_config,
         path=tmpdir.strpath + "_b",
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
@@ -212,12 +211,15 @@ async def test_train_named_model(component_builder, tmpdir):
     assert normalized_path == tmpdir.strpath
 
 
-async def test_handles_pipeline_with_non_existing_component(component_builder):
-    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
-    _config.pipeline.append({"name": "my_made_up_component"})
+async def test_handles_pipeline_with_non_existing_component(
+    component_builder, pretrained_embeddings_spacy_config
+):
+    pretrained_embeddings_spacy_config.pipeline.append({"name": "my_made_up_component"})
     with pytest.raises(Exception) as execinfo:
         await train(
-            _config, data=DEFAULT_DATA_PATH, component_builder=component_builder
+            pretrained_embeddings_spacy_config,
+            data=DEFAULT_DATA_PATH,
+            component_builder=component_builder,
         )
     assert "Cannot find class" in str(execinfo.value)
 
diff --git a/tests/nlu/base/test_utils.py b/tests/nlu/test_utils.py
similarity index 100%
rename from tests/nlu/base/test_utils.py
rename to tests/nlu/test_utils.py
diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py
index df459bb33659..5cffefd2746f 100644
--- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py
+++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py
@@ -80,9 +80,7 @@ def test_whitespace_with_case(text, component_config, expected_tokens):
     assert [t.text for t in tokens] == expected_tokens
 
 
-def test_whitespace_training():
-    _config = utilities.base_test_conf("supervised_embeddings")
-
+def test_whitespace_training(supervised_embeddings_config):
     examples = [
         Message(
             "Any Mexican restaurant will do",
@@ -107,7 +105,7 @@ def test_whitespace_training():
     component_config = {"case_sensitive": False}
     tk = WhitespaceTokenizer(component_config)
 
-    tk.train(TrainingData(training_examples=examples), _config)
+    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)
 
     assert examples[0].data.get("tokens")[0].text == "any"
     assert examples[0].data.get("tokens")[1].text == "mexican"
diff --git a/tests/nlu/training/__init__.py b/tests/nlu/training_data/__init__.py
similarity index 100%
rename from tests/nlu/training/__init__.py
rename to tests/nlu/training_data/__init__.py
diff --git a/tests/nlu/base/test_training_data.py b/tests/nlu/training_data/test_training_data.py
similarity index 99%
rename from tests/nlu/base/test_training_data.py
rename to tests/nlu/training_data/test_training_data.py
index 6423816924db..c0f7c05c1e2f 100644
--- a/tests/nlu/base/test_training_data.py
+++ b/tests/nlu/training_data/test_training_data.py
@@ -1,4 +1,3 @@
-import logging
 from typing import Optional, Text
 
 import pytest
diff --git a/tests/nlu/utilities.py b/tests/nlu/utilities.py
index 6cf509ee435d..2f43cb55fcef 100644
--- a/tests/nlu/utilities.py
+++ b/tests/nlu/utilities.py
@@ -1,6 +1,5 @@
 import tempfile
 
-import pytest
 import ruamel.yaml as yaml
 
 from rasa.nlu.config import RasaNLUModelConfig
@@ -9,11 +8,6 @@
 
 
 def base_test_conf(pipeline_template):
-    # 'response_log': temp_log_file_dir(),
-    # 'port': 5022,
-    # "path": tempfile.mkdtemp(),
-    # "data": "./data/test/demo-rasa-small.json"
-
     return RasaNLUModelConfig({"pipeline": pipeline_template})
 
 
@@ -34,10 +28,6 @@ async def interpreter_for(component_builder, data, path, config):
     return interpreter
 
 
-def temp_log_file_dir():
-    return tempfile.mkdtemp(suffix="_rasa_nlu_logs")
-
-
 class ResponseTest:
     def __init__(self, endpoint, expected_response, payload=None):
         self.endpoint = endpoint

From 2ad0f436fe210dc82bc53ad988bb489c8d28e2b3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 15:41:03 +0100
Subject: [PATCH 424/633] add deprecation warnings

---
 rasa/core/policies/embedding_policy.py              |  3 ++-
 rasa/core/policies/keras_policy.py                  | 10 +++++++++-
 rasa/nlu/classifiers/embedding_intent_classifier.py |  7 ++++---
 rasa/nlu/classifiers/sklearn_intent_classifier.py   |  9 ++++++++-
 rasa/nlu/config.py                                  |  9 +++++++++
 rasa/nlu/extractors/crf_entity_extractor.py         |  8 +++++---
 rasa/nlu/selectors/response_selector.py             |  3 ++-
 7 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index e263553a1cc3..84bbd8572459 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -132,7 +132,8 @@ def __init__(
         super().__init__(featurizer, priority, max_history, model, **kwargs)
 
         raise_warning(
-            f"'EmbeddingPolicy' is deprecated. Use 'TEDPolicy' instead.",
+            f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. "
+            f"Use 'TEDPolicy' instead.",
             category=FutureWarning,
             docs=f"{DOCS_BASE_URL}/core/policies/",
         )
diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 8c8ee8879763..5d95b799e0af 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -14,10 +14,11 @@
     MaxHistoryTrackerFeaturizer,
     BinarySingleStateFeaturizer,
 )
+from rasa.constants import DOCS_BASE_URL
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.policy import Policy
 from rasa.core.trackers import DialogueStateTracker
-from rasa.utils.common import obtain_verbosity
+from rasa.utils.common import obtain_verbosity, raise_warning
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 
 # there are a number of issues with imports from tensorflow. hence the deactivation
@@ -70,6 +71,13 @@ def __init__(
 
         self.current_epoch = current_epoch
 
+        raise_warning(
+            "'KerasPolicy' is deprecated and will be removed in version "
+            "2.0. Use 'TEDPolicy' instead.",
+            category=FutureWarning,
+            docs=f"{DOCS_BASE_URL}/core/policies/",
+        )
+
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         config = copy.deepcopy(self.defaults)
         config.update(kwargs)
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 1828a65f5cc8..4eb10de4b860 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, Optional, Text
 
-from rasa.constants import DOCS_BASE_URL
+from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES
@@ -137,7 +137,8 @@ def __init__(
         )
 
         raise_warning(
-            f"'EmbeddingIntentClassifier' is deprecated. Use 'DIETClassifier' instead.",
+            "'EmbeddingIntentClassifier' is deprecated and will be removed in version "
+            "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs=f"{DOCS_BASE_URL}/nlu/components/",
+            docs=DOCS_URL_COMPONENTS,
         )
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 82399446a76d..cfc976d0966a 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 
-from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
+from rasa.constants import DOCS_URL_COMPONENTS, DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu import utils
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import Component
@@ -63,6 +63,13 @@ def __init__(
             self.le = LabelEncoder()
         self.clf = clf
 
+        raise_warning(
+            "'SklearnIntentClassifier' is deprecated and will be removed in version "
+            "2.0. Use 'DIETClassifier' instead.",
+            category=FutureWarning,
+            docs=DOCS_URL_COMPONENTS,
+        )
+
     @classmethod
     def required_packages(cls) -> List[Text]:
         return ["sklearn"]
diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py
index a39d5632466a..a8b1e4155903 100644
--- a/rasa/nlu/config.py
+++ b/rasa/nlu/config.py
@@ -118,6 +118,15 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No
             pipeline = registry.pipeline_template(template_name)
 
             if pipeline:
+                raise_warning(
+                    "You are using a pipeline template. All pipelines templates "
+                    "are deprecated and will be removed in version 2.0. Please add "
+                    "the components you want to use directly to your configuration "
+                    "file.",
+                    FutureWarning,
+                    docs=DOCS_URL_PIPELINE,
+                )
+
                 # replaces the template with the actual components
                 self.__dict__["pipeline"] = pipeline
             else:
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 02e36c023b6d..8f4c0c1475ac 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -7,6 +7,7 @@
 from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
     LexicalSyntacticFeaturizer,
 )
+from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.constants import DOCS_BASE_URL
@@ -135,10 +136,11 @@ def __init__(
         )
 
         raise_warning(
-            f"'CRFEntityExtractor' is deprecated. Use 'DIETClassifier' in "
-            f"combination with 'LexicalSyntacticFeaturizer' instead.",
+            f"'CRFEntityExtractor' is deprecated and will be remove in version 2.0. "
+            f"Use 'DIETClassifier' in combination with 'LexicalSyntacticFeaturizer' "
+            f"instead.",
             category=FutureWarning,
-            docs=f"{DOCS_BASE_URL}/nlu/components/",
+            docs=DOCS_URL_COMPONENTS,
         )
 
     def train(
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 84a1e0cdd337..88c5f1cd5bf1 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -175,7 +175,8 @@ def __init__(
         )
 
         raise_warning(
-            f"'ResponseSelector' is deprecated. Use 'DIETSelector' instead.",
+            f"'ResponseSelector' is deprecated and will be removed in version 2.0. "
+            f"Use 'DIETSelector' instead.",
             category=FutureWarning,
             docs=f"{DOCS_BASE_URL}/nlu/components/",
         )

From a067a4c1a161d60eb6b812f1e33a759e893e55cd Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 15:47:46 +0100
Subject: [PATCH 425/633] update default config

---
 rasa/cli/initial_project/config.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml
index 116158293b14..8e95024ee7af 100644
--- a/rasa/cli/initial_project/config.yml
+++ b/rasa/cli/initial_project/config.yml
@@ -1,7 +1,18 @@
 # Configuration for Rasa NLU.
 # https://rasa.com/docs/rasa/nlu/components/
 language: en
-pipeline: supervised_embeddings
+pipeline:
+  - name: WhitespaceTokenizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
 
 # Configuration for Rasa Core.
 # https://rasa.com/docs/rasa/core/policies/

From 71b53679cefe99f71c178d16135a1450eaa9a326 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 15:51:47 +0100
Subject: [PATCH 426/633] move sample_configs to data/test_config

---
 .../test_config}/config_crf_custom_features.yml  |  0
 .../test_config}/config_defaults.yml             |  0
 ...config_embedding_intent_response_selector.yml |  0
 .../config_pretrained_embeddings_convert.yml     |  0
 .../config_pretrained_embeddings_mitie.yml       |  0
 .../config_pretrained_embeddings_mitie_2.yml     |  0
 .../config_pretrained_embeddings_mitie_zh.yml    |  0
 .../config_pretrained_embeddings_spacy.yml       |  0
 .../config_pretrained_embeddings_spacy_de.yml    |  0
 ...nfig_pretrained_embeddings_spacy_duckling.yml |  0
 .../config_supervised_embeddings.yml             |  0
 .../config_supervised_embeddings_duckling.yml    |  0
 .../test_config}/config_train_server_json.yml    |  0
 .../test_config}/config_train_server_md.yml      |  0
 docker/Dockerfile_full                           |  2 +-
 docker/Dockerfile_pretrained_embeddings_mitie_en |  2 +-
 docker/Dockerfile_pretrained_embeddings_spacy_de |  2 +-
 docker/Dockerfile_pretrained_embeddings_spacy_en |  2 +-
 docs/nlu/choosing-a-pipeline.rst                 | 16 ++++++++--------
 tests/utils/test_validation.py                   |  6 +++---
 20 files changed, 15 insertions(+), 15 deletions(-)
 rename {sample_configs => data/test_config}/config_crf_custom_features.yml (100%)
 rename {sample_configs => data/test_config}/config_defaults.yml (100%)
 rename {sample_configs => data/test_config}/config_embedding_intent_response_selector.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_convert.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_mitie.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_mitie_2.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_mitie_zh.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_spacy.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_spacy_de.yml (100%)
 rename {sample_configs => data/test_config}/config_pretrained_embeddings_spacy_duckling.yml (100%)
 rename {sample_configs => data/test_config}/config_supervised_embeddings.yml (100%)
 rename {sample_configs => data/test_config}/config_supervised_embeddings_duckling.yml (100%)
 rename {sample_configs => data/test_config}/config_train_server_json.yml (100%)
 rename {sample_configs => data/test_config}/config_train_server_md.yml (100%)

diff --git a/sample_configs/config_crf_custom_features.yml b/data/test_config/config_crf_custom_features.yml
similarity index 100%
rename from sample_configs/config_crf_custom_features.yml
rename to data/test_config/config_crf_custom_features.yml
diff --git a/sample_configs/config_defaults.yml b/data/test_config/config_defaults.yml
similarity index 100%
rename from sample_configs/config_defaults.yml
rename to data/test_config/config_defaults.yml
diff --git a/sample_configs/config_embedding_intent_response_selector.yml b/data/test_config/config_embedding_intent_response_selector.yml
similarity index 100%
rename from sample_configs/config_embedding_intent_response_selector.yml
rename to data/test_config/config_embedding_intent_response_selector.yml
diff --git a/sample_configs/config_pretrained_embeddings_convert.yml b/data/test_config/config_pretrained_embeddings_convert.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_convert.yml
rename to data/test_config/config_pretrained_embeddings_convert.yml
diff --git a/sample_configs/config_pretrained_embeddings_mitie.yml b/data/test_config/config_pretrained_embeddings_mitie.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_mitie.yml
rename to data/test_config/config_pretrained_embeddings_mitie.yml
diff --git a/sample_configs/config_pretrained_embeddings_mitie_2.yml b/data/test_config/config_pretrained_embeddings_mitie_2.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_mitie_2.yml
rename to data/test_config/config_pretrained_embeddings_mitie_2.yml
diff --git a/sample_configs/config_pretrained_embeddings_mitie_zh.yml b/data/test_config/config_pretrained_embeddings_mitie_zh.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_mitie_zh.yml
rename to data/test_config/config_pretrained_embeddings_mitie_zh.yml
diff --git a/sample_configs/config_pretrained_embeddings_spacy.yml b/data/test_config/config_pretrained_embeddings_spacy.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_spacy.yml
rename to data/test_config/config_pretrained_embeddings_spacy.yml
diff --git a/sample_configs/config_pretrained_embeddings_spacy_de.yml b/data/test_config/config_pretrained_embeddings_spacy_de.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_spacy_de.yml
rename to data/test_config/config_pretrained_embeddings_spacy_de.yml
diff --git a/sample_configs/config_pretrained_embeddings_spacy_duckling.yml b/data/test_config/config_pretrained_embeddings_spacy_duckling.yml
similarity index 100%
rename from sample_configs/config_pretrained_embeddings_spacy_duckling.yml
rename to data/test_config/config_pretrained_embeddings_spacy_duckling.yml
diff --git a/sample_configs/config_supervised_embeddings.yml b/data/test_config/config_supervised_embeddings.yml
similarity index 100%
rename from sample_configs/config_supervised_embeddings.yml
rename to data/test_config/config_supervised_embeddings.yml
diff --git a/sample_configs/config_supervised_embeddings_duckling.yml b/data/test_config/config_supervised_embeddings_duckling.yml
similarity index 100%
rename from sample_configs/config_supervised_embeddings_duckling.yml
rename to data/test_config/config_supervised_embeddings_duckling.yml
diff --git a/sample_configs/config_train_server_json.yml b/data/test_config/config_train_server_json.yml
similarity index 100%
rename from sample_configs/config_train_server_json.yml
rename to data/test_config/config_train_server_json.yml
diff --git a/sample_configs/config_train_server_md.yml b/data/test_config/config_train_server_md.yml
similarity index 100%
rename from sample_configs/config_train_server_md.yml
rename to data/test_config/config_train_server_md.yml
diff --git a/docker/Dockerfile_full b/docker/Dockerfile_full
index 94086c350fd4..3209ad3bf4a8 100644
--- a/docker/Dockerfile_full
+++ b/docker/Dockerfile_full
@@ -72,7 +72,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY sample_configs/config_pretrained_embeddings_spacy_duckling.yml config.yml
+COPY data/test_config/config_pretrained_embeddings_spacy_duckling.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en
index 7f0737e9404b..8e7acdff149f 100644
--- a/docker/Dockerfile_pretrained_embeddings_mitie_en
+++ b/docker/Dockerfile_pretrained_embeddings_mitie_en
@@ -66,7 +66,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY sample_configs/config_pretrained_embeddings_mitie.yml config.yml
+COPY data/test_config/config_pretrained_embeddings_mitie.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_de b/docker/Dockerfile_pretrained_embeddings_spacy_de
index a95318485c66..68975812e80c 100644
--- a/docker/Dockerfile_pretrained_embeddings_spacy_de
+++ b/docker/Dockerfile_pretrained_embeddings_spacy_de
@@ -67,7 +67,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY sample_configs/config_pretrained_embeddings_spacy_de.yml config.yml
+COPY data/test_config/config_pretrained_embeddings_spacy_de.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_en b/docker/Dockerfile_pretrained_embeddings_spacy_en
index 289c20053349..96976d18e8f9 100644
--- a/docker/Dockerfile_pretrained_embeddings_spacy_en
+++ b/docker/Dockerfile_pretrained_embeddings_spacy_en
@@ -67,7 +67,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY sample_configs/config_pretrained_embeddings_spacy.yml config.yml
+COPY data/test_config/config_pretrained_embeddings_spacy.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index ff3d374d43b1..f51081b002c4 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -21,13 +21,13 @@ The Short Answer
 
 If your training data is in english, a good starting point is using ``pretrained_embeddings_convert`` pipeline.
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_convert.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_convert.yml
     :language: yaml
 
 In case your training data is multi-lingual and is rich with domain specific vocabulary,
 use the ``supervised_embeddings`` pipeline:
 
-.. literalinclude:: ../../sample_configs/config_supervised_embeddings.yml
+.. literalinclude:: ../../data/test_config/config_supervised_embeddings.yml
     :language: yaml
 
 
@@ -246,7 +246,7 @@ Pre-configured Pipelines
 A template is just a shortcut for
 a full list of components. For example, these two configurations are equivalent:
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_spacy.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_spacy.yml
     :language: yaml
 
 .. code-block:: yaml
@@ -272,7 +272,7 @@ supervised_embeddings
 To train a Rasa model in your preferred language, define the
 ``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file:
 
-.. literalinclude:: ../../sample_configs/config_supervised_embeddings.yml
+.. literalinclude:: ../../data/test_config/config_supervised_embeddings.yml
     :language: yaml
 
 The ``supervised_embeddings`` pipeline supports any language that can be tokenized.  By default it uses whitespace
@@ -312,7 +312,7 @@ pretrained_embeddings_convert
 
 To use the ``pretrained_embeddings_convert`` template:
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_convert.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_convert.yml
     :language: yaml
 
 To use the components and configure them separately:
@@ -333,7 +333,7 @@ pretrained_embeddings_spacy
 
 To use the ``pretrained_embeddings_spacy`` template:
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_spacy.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_spacy.yml
     :language: yaml
 
 See :ref:`pretrained-word-vectors` for more information about loading spacy language models.
@@ -360,13 +360,13 @@ MITIE
 To use the MITIE pipeline, you will have to train word vectors from a corpus. Instructions can be found
 :ref:`here <mitie>`. This will give you the file path to pass to the ``model`` parameter.
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_mitie.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_mitie.yml
     :language: yaml
 
 Another version of this pipeline uses MITIE's featurizer and also its multi-class classifier.
 Training can be quite slow, so this is not recommended for large datasets.
 
-.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_mitie_2.yml
+.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_mitie_2.yml
     :language: yaml
 
 
diff --git a/tests/utils/test_validation.py b/tests/utils/test_validation.py
index d691d07a2036..dc34c1ec64a3 100644
--- a/tests/utils/test_validation.py
+++ b/tests/utils/test_validation.py
@@ -10,9 +10,9 @@
     "file, schema",
     [
         ("examples/restaurantbot/domain.yml", DOMAIN_SCHEMA_FILE),
-        ("sample_configs/config_defaults.yml", CONFIG_SCHEMA_FILE),
-        ("sample_configs/config_supervised_embeddings.yml", CONFIG_SCHEMA_FILE),
-        ("sample_configs/config_crf_custom_features.yml", CONFIG_SCHEMA_FILE),
+        ("data/test_config/config_defaults.yml", CONFIG_SCHEMA_FILE),
+        ("data/test_config/config_supervised_embeddings.yml", CONFIG_SCHEMA_FILE),
+        ("data/test_config/config_crf_custom_features.yml", CONFIG_SCHEMA_FILE),
     ],
 )
 def test_validate_yaml_schema(file, schema):

From ad03e563e5ef1de2ed58c5e01219f2fb977e2a26 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 18 Feb 2020 16:51:07 +0100
Subject: [PATCH 427/633] fix tests

---
 tests/nlu/test_evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index d7251da69e6e..55829972f019 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -213,7 +213,7 @@ def test_determine_token_labels_throws_error():
         determine_token_labels(
             CH_correct_segmentation[0],
             [CH_correct_entity, CH_wrong_entity],
-            set(CRFEntityExtractor.name),
+            [CRFEntityExtractor.name],
         )
 
 
@@ -232,7 +232,7 @@ def test_determine_token_labels_with_extractors():
     determine_token_labels(
         CH_correct_segmentation[0],
         [CH_correct_entity, CH_wrong_entity],
-        set(SpacyEntityExtractor.name, MitieEntityExtractor.name),
+        [SpacyEntityExtractor.name, MitieEntityExtractor.name],
     )
 
 

From 62a9b9587bb831710716a4e487a86850b29f1c1c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 18 Feb 2020 22:57:11 +0100
Subject: [PATCH 428/633] validate required components

---
 rasa/nlu/components.py                        | 33 +++++++++++++++++++
 .../dense_featurizer/convert_featurizer.py    |  3 ++
 .../dense_featurizer/lm_featurizer.py         |  4 +++
 .../dense_featurizer/mitie_featurizer.py      |  4 +++
 .../dense_featurizer/spacy_featurizer.py      |  4 +++
 rasa/nlu/model.py                             |  7 ++--
 rasa/nlu/tokenizers/lm_tokenizer.py           |  3 ++
 rasa/nlu/tokenizers/spacy_tokenizer.py        |  3 ++
 8 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 2bfef406b050..10eaf5bcc4a5 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -51,6 +51,36 @@ def validate_requirements(component_names: List[Text]) -> None:
         )
 
 
+def validate_tokenizers(pipeline: List["Component"]) -> None:
+    from rasa.nlu.tokenizers.tokenizer import Tokenizer
+
+    tokenizer_names = []
+    for component in pipeline:
+        if isinstance(component, Tokenizer):
+            tokenizer_names.append(component.name)
+
+    if not tokenizer_names:
+        raise Exception(
+            f"No tokenizer is used. You should add one tokenizer to your pipeline."
+        )
+    elif len(tokenizer_names) > 1:
+        raise Exception(
+            f"More then one tokenizer is used: {tokenizer_names}. "
+            f"You can use only one tokenizer."
+        )
+
+
+def validate_required_components(pipeline: List["Component"]) -> None:
+    unique_component_names = set()
+    for component in pipeline:
+        unique_component_names.add(component.name)
+        if not set(component.required_components).issubset(unique_component_names):
+            raise Exception(
+                f"'{component.name}' requires {component.required_components}. "
+                f"Add required components to the pipeline"
+            )
+
+
 def validate_arguments(
     pipeline: List["Component"],
     context: Dict[Text, Any],
@@ -225,6 +255,9 @@ def name(self):
     # provided properties from the previous components.
     requires = []
 
+    # Which components are required by this component
+    required_components = []
+
     # Defines the default configuration parameters of a component
     # these values can be overwritten in the pipeline configuration
     # of the model. The component should choose sensible defaults
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index e2bc527302b8..b17ea18c342d 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -5,6 +5,7 @@
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
@@ -29,6 +30,8 @@ class ConveRTFeaturizer(Featurizer):
 
     requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
+    required_components = [ConveRTTokenizer.name]
+
     def _load_model(self) -> None:
 
         # needed in order to load model
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index eef8f16d1ac8..3a8e152c6773 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -3,6 +3,8 @@
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
+from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.training_data import Message, TrainingData
 
 from rasa.nlu.constants import (
@@ -26,6 +28,8 @@ class LanguageModelFeaturizer(Featurizer):
         LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
+    required_components = [HFTransformersNLP.name, LanguageModelTokenizer.name]
+
     def train(
         self,
         training_data: TrainingData,
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index dcbc04355a3a..485b0808f4cf 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -5,6 +5,8 @@
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.utils.mitie_utils import MitieNLP
+from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.training_data import Message, TrainingData
 
 if typing.TYPE_CHECKING:
@@ -27,6 +29,8 @@ class MitieFeaturizer(Featurizer):
         "mitie_feature_extractor"
     ]
 
+    required_components = [MitieNLP.name, MitieTokenizer.name]
+
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
         # the CLS token. Available options: 'mean' and 'max'
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 901ad394e345..378e715dca9c 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -4,6 +4,8 @@
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.utils.spacy_utils import SpacyNLP
+from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.training_data import Message, TrainingData
 
 if typing.TYPE_CHECKING:
@@ -28,6 +30,8 @@ class SpacyFeaturizer(Featurizer):
         SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
+    required_components = [SpacyNLP.name, SpacyTokenizer.name]
+
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
         # the CLS token. Available options: 'mean' and 'max'
diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py
index f353279cfa37..185ccc084baf 100644
--- a/rasa/nlu/model.py
+++ b/rasa/nlu/model.py
@@ -144,9 +144,8 @@ def __init__(
         # build pipeline
         self.pipeline = self._build_pipeline(cfg, component_builder)
 
-    @staticmethod
     def _build_pipeline(
-        cfg: RasaNLUModelConfig, component_builder: ComponentBuilder
+        self, cfg: RasaNLUModelConfig, component_builder: ComponentBuilder
     ) -> List[Component]:
         """Transform the passed names of the pipeline components into classes."""
 
@@ -158,6 +157,10 @@ def _build_pipeline(
             component = component_builder.create_component(component_cfg, cfg)
             pipeline.append(component)
 
+        if not self.skip_validation:
+            components.validate_tokenizers(pipeline)
+            components.validate_required_components(pipeline)
+
         return pipeline
 
     def train(self, data: TrainingData, **kwargs: Any) -> "Interpreter":
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 0a12e65576c1..7fcbe5aca3d0 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -1,6 +1,7 @@
 from typing import Text, List, Any, Dict
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.training_data import Message
 
 from rasa.nlu.constants import (
@@ -20,6 +21,8 @@ class LanguageModelTokenizer(Tokenizer):
         LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
 
+    required_components = [HFTransformersNLP.name]
+
     defaults = {
         # Flag to check whether to split intents
         "intent_tokenization_flag": False,
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 6e1b8462c9bd..08ef77cfbdfa 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -2,6 +2,7 @@
 from typing import Text, List, Any
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.nlu.training_data import Message
 
 from rasa.nlu.constants import TOKENS_NAMES, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES
@@ -21,6 +22,8 @@ class SpacyTokenizer(Tokenizer):
 
     requires = [SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
+    required_components = [SpacyNLP.name]
+
     defaults = {
         # Flag to check whether to split intents
         "intent_tokenization_flag": False,

From 62ca53e5679207ca16c6e3d7ae018644b1e1e1ea Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 08:26:01 +0100
Subject: [PATCH 429/633] test fix on gh actions

---
 Makefile                                          | 2 +-
 rasa/nlu/classifiers/sklearn_intent_classifier.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 47a6b9d93a10..176ee0714c83 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,7 @@ prepare-tests-files:
 
 test: clean
 	# OMP_NUM_THREADS can improve overral performance using one thread by process (on tensorflow), avoiding overload
-	OMP_NUM_THREADS=1 pytest tests -n $(JOBS) --cov rasa
+	OMP_NUM_THREADS=1 pytest tests/nlu/test_evaluation.py -n $(JOBS) --cov rasa
 
 doctest: clean
 	cd docs && make doctest
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index cfc976d0966a..1c85a678ceee 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -145,7 +145,7 @@ def _create_classifier(
         # dirty str fix because sklearn is expecting
         # str not instance of basestr...
         tuned_parameters = [
-            {"C": C, "gamma": gamma, "kernel": [str(k) for k in kernels]}
+            {"C": np.array(C), "gamma": gamma, "kernel": [str(k) for k in kernels]}
         ]
 
         # aim for 5 examples in each fold

From 2a6c4e41e5d6d24f7139914875735d17bd751692 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 08:42:30 +0100
Subject: [PATCH 430/633] test everything

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 176ee0714c83..47a6b9d93a10 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,7 @@ prepare-tests-files:
 
 test: clean
 	# OMP_NUM_THREADS can improve overral performance using one thread by process (on tensorflow), avoiding overload
-	OMP_NUM_THREADS=1 pytest tests/nlu/test_evaluation.py -n $(JOBS) --cov rasa
+	OMP_NUM_THREADS=1 pytest tests -n $(JOBS) --cov rasa
 
 doctest: clean
 	cd docs && make doctest

From 3c1551f6432e151f6585e2b11b265ae0e9d404d0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 08:45:15 +0100
Subject: [PATCH 431/633] update docker configs

---
 docker/Dockerfile_full                              |  2 +-
 docker/Dockerfile_pretrained_embeddings_mitie_en    |  2 +-
 docker/Dockerfile_pretrained_embeddings_spacy_de    |  2 +-
 docker/Dockerfile_pretrained_embeddings_spacy_en    |  2 +-
 .../configs/config_pretrained_embeddings_mitie.yml  | 11 +++++++++++
 .../configs/config_pretrained_embeddings_spacy.yml  |  3 +++
 .../config_pretrained_embeddings_spacy_de.yml       |  3 +++
 .../config_pretrained_embeddings_spacy_duckling.yml | 13 +++++++++++++
 8 files changed, 34 insertions(+), 4 deletions(-)
 create mode 100644 docker/configs/config_pretrained_embeddings_mitie.yml
 create mode 100644 docker/configs/config_pretrained_embeddings_spacy.yml
 create mode 100644 docker/configs/config_pretrained_embeddings_spacy_de.yml
 create mode 100644 docker/configs/config_pretrained_embeddings_spacy_duckling.yml

diff --git a/docker/Dockerfile_full b/docker/Dockerfile_full
index 3209ad3bf4a8..e75d4b6c12a0 100644
--- a/docker/Dockerfile_full
+++ b/docker/Dockerfile_full
@@ -72,7 +72,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY data/test_config/config_pretrained_embeddings_spacy_duckling.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_spacy_duckling.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en
index 8e7acdff149f..663986b7faf1 100644
--- a/docker/Dockerfile_pretrained_embeddings_mitie_en
+++ b/docker/Dockerfile_pretrained_embeddings_mitie_en
@@ -66,7 +66,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY data/test_config/config_pretrained_embeddings_mitie.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_mitie.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_de b/docker/Dockerfile_pretrained_embeddings_spacy_de
index 68975812e80c..55b2011fc991 100644
--- a/docker/Dockerfile_pretrained_embeddings_spacy_de
+++ b/docker/Dockerfile_pretrained_embeddings_spacy_de
@@ -67,7 +67,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY data/test_config/config_pretrained_embeddings_spacy_de.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_spacy_de.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_en b/docker/Dockerfile_pretrained_embeddings_spacy_en
index 96976d18e8f9..1f54d2b72ed4 100644
--- a/docker/Dockerfile_pretrained_embeddings_spacy_en
+++ b/docker/Dockerfile_pretrained_embeddings_spacy_en
@@ -67,7 +67,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY data/test_config/config_pretrained_embeddings_spacy.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_spacy.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docker/configs/config_pretrained_embeddings_mitie.yml b/docker/configs/config_pretrained_embeddings_mitie.yml
new file mode 100644
index 000000000000..d1b8b86dd953
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_mitie.yml
@@ -0,0 +1,11 @@
+language: "en"
+
+pipeline:
+- name: "MitieNLP"
+  model: "data/total_word_feature_extractor.dat"
+- name: "MitieTokenizer"
+- name: "MitieEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "RegexFeaturizer"
+- name: "MitieFeaturizer"
+- name: "SklearnIntentClassifier"
diff --git a/docker/configs/config_pretrained_embeddings_spacy.yml b/docker/configs/config_pretrained_embeddings_spacy.yml
new file mode 100644
index 000000000000..3516519cd529
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_spacy.yml
@@ -0,0 +1,3 @@
+language: "en"
+
+pipeline: "pretrained_embeddings_spacy"
diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml
new file mode 100644
index 000000000000..7345028fab2e
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml
@@ -0,0 +1,3 @@
+language: "de"
+
+pipeline: "pretrained_embeddings_spacy"
diff --git a/docker/configs/config_pretrained_embeddings_spacy_duckling.yml b/docker/configs/config_pretrained_embeddings_spacy_duckling.yml
new file mode 100644
index 000000000000..62fce290bef6
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_spacy_duckling.yml
@@ -0,0 +1,13 @@
+language: "en"
+
+pipeline:
+- name: "SpacyNLP"
+- name: "SpacyTokenizer"
+- name: "RegexFeaturizer"
+- name: "SpacyFeaturizer"
+- name: "CRFEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "SklearnIntentClassifier"
+- name: "DucklingHTTPExtractor"
+  url: "http://duckling:8000"
+  
\ No newline at end of file

From 88c9b65f1833b88fa1a2419b2101a0c37eaa8907 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 09:52:17 +0100
Subject: [PATCH 432/633] update components.rst

---
 docs/nlu/components.rst                 | 210 ++++++++++++++++++------
 rasa/nlu/selectors/diet_selector.py     |  12 +-
 rasa/nlu/selectors/response_selector.py |   3 -
 3 files changed, 167 insertions(+), 58 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 16cf5320b13c..650f34890349 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -15,6 +15,10 @@ Components
    ``supervised_embeddings``, and ``spacy_sklearn`` is now known as
    ``pretrained_embeddings_spacy``. Please update your code if you are using these.
 
+.. note::
+    We deprecated all pre-defined pipeline templates. Take a look at :ref:`choosing-a-pipeline`
+    to decide on what components you should use in your configuration file.
+
 This is a reference of the configuration options for every built-in
 component in Rasa NLU. If you want to build a custom component, check
 out :ref:`custom-nlu-components`.
@@ -61,7 +65,7 @@ SpacyNLP
 :Outputs: Nothing
 :Requires: Nothing
 :Description:
-    Initializes spacy structures. Every spaCy component relies on this, hence this should be put at the beginning
+    Initializes spaCy structures. Every spaCy component relies on this, hence this should be put at the beginning
     of every pipeline that uses any spaCy components.
 :Configuration:
     Language model, default will use the configured language.
@@ -91,12 +95,12 @@ HFTransformersNLP
 ~~~~~~~~~~~~~~~~~
 
 :Short: HuggingFace's Transformers based pre-trained language model initializer
-:Outputs: nothing
-:Requires: nothing
+:Outputs: Nothing
+:Requires: Nothing
 :Description:
     Initializes specified pre-trained language model from HuggingFace's `Transformers library
-    <https://huggingface.co/transformers/>`__.  The component applies language model specific tokenization and featurization
-    to compute sequence and sentence level representations for each example in the training data.
+    <https://huggingface.co/transformers/>`__.  The component applies language model specific tokenization and
+    featurization to compute sequence and sentence level representations for each example in the training data.
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
 :Configuration:
@@ -265,6 +269,26 @@ ConveRTTokenizer
           "case_sensitive": True
 
 
+.. _LanguageModelTokenizer:
+
+LanguageModelTokenizer
+~~~~~~~~~~~~~~~~~~~~~~
+
+:Short: Tokenizer from pre-trained language models
+:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Requires: :ref:`HFTransformersNLP`
+:Description:
+    Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component.
+    Must be used whenever the ``LanguageModelFeaturizer`` is used.
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "LanguageModelTokenizer"
+
+
+
 .. _text-featurizers:
 
 Text Featurizers
@@ -274,7 +298,7 @@ Text featurizers are divided into two different categories: sparse featurizers a
 Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros.
 As those feature vectors would normally take up a lot of memory, we store them as sparse features.
 Sparse features only store the values that are non zero and their positions in the vector.
-Thus, we save a lot of memroy and are able to train on larger datasets.
+Thus, we save a lot of memory and are able to train on larger datasets.
 
 By default all featurizers will return a matrix of length (number-of-tokens x feature-dimension).
 So, the returned matrix will have a feature vector for every token.
@@ -287,7 +311,7 @@ MitieFeaturizer
 ~~~~~~~~~~~~~~~
 
 :Short:
-    Creates a vector representation of user message and response (if specified) using the spaCy featurizer.
+    Creates a vector representation of user message and response (if specified) using the MITIE featurizer.
 :Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`MitieNLP`
 :Type: Dense featurizer
@@ -300,7 +324,7 @@ MitieFeaturizer
         NOT used by the ``MitieIntentClassifier`` component.
 
 :Configuration:
-    The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
+    The sentence vector, e.g. the vector of the ``__CLS__`` token can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
     The default pooling method is set to ``mean``.
 
@@ -309,7 +333,7 @@ MitieFeaturizer
         pipeline:
         - name: "MitieFeaturizer"
           # Specify what pooling operation should be used to calculate the vector of
-          # the CLS token. Available options: 'mean' and 'max'.
+          # the __CLS__ token. Available options: 'mean' and 'max'.
           "pooling": "mean"
 
 
@@ -325,7 +349,7 @@ SpacyFeaturizer
     Creates features for entity extraction, intent classification, and response classification using the spaCy
     featurizer.
 :Configuration:
-    The sentence vector, e.g. the vector of the ``CLS`` token can be calculated in two different ways, either via
+    The sentence vector, e.g. the vector of the ``__CLS__`` token can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
     The default pooling method is set to ``mean``.
 
@@ -334,7 +358,7 @@ SpacyFeaturizer
         pipeline:
         - name: "SpacyFeaturizer"
           # Specify what pooling operation should be used to calculate the vector of
-          # the CLS token. Available options: 'mean' and 'max'.
+          # the __CLS__ token. Available options: 'mean' and 'max'.
           "pooling": "mean"
 
 
@@ -375,13 +399,11 @@ LanguageModelFeaturizer
 
 :Short:
     Creates a vector representation of user message and response (if specified) using a pre-trained language model.
-:Outputs:
-    nothing, used as an input to intent classifiers and response selectors that need intent features and response
-    features respectively (e.g. ``DIETClassifier`` and ``ResponseSelector``)
+:Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`HFTransformersNLP`
 :Type: Dense featurizer
 :Description:
-    Creates features for intent classification and response selection.
+    Creates features for entity extraction, intent classification, and response selection.
     Uses the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component to compute vector
     representations of input text.
 
@@ -391,14 +413,12 @@ LanguageModelFeaturizer
 
 :Configuration:
 
-    Include ``HFTransformersNLP`` component before this component. Also, use :ref:`LanguageModelTokenizer` to ensure tokens
-    are correctly set for all components throughout the pipeline.
+    Include ``HFTransformersNLP`` component before this component. Also, use :ref:`LanguageModelTokenizer` to ensure
+    tokens are correctly set for all components throughout the pipeline.
 
     .. code-block:: yaml
 
         pipeline:
-        - name: "HFTransformersNLP"
-          model_name: # Name of language model to use
         - name: "LanguageModelFeaturizer"
 
 
@@ -416,7 +436,8 @@ RegexFeaturizer
     For each regex, a feature will be set marking whether this expression was found in the input, which will later
     be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned
     during the training phase, that this set feature indicates a certain intent / entity).
-    Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` component!
+    Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` and the
+    ``DIETClassifier`` components!
 
 :Configuration:
 
@@ -574,7 +595,7 @@ LexicalSyntacticFeaturizer
     ==============  =============================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
-    previous words, the current word, and the next words in the sliding window.
+    previous tokens, the current token, and the next tokens in the sliding window.
     You define the features as [before, token, after] array.
     If you, for example, want to define features for the token before, the current token, and the token after,
     your features configuration could look like this:
@@ -664,10 +685,9 @@ SklearnIntentClassifier
         }
 
 :Description:
-    The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. In addition
-    to other classifiers it also provides rankings of the labels that did not "win". The ``SklearnIntentClassifier``
-    needs to be preceded by a dense featurizer in the pipeline. This dense featurizer creates the features used for
-    the classification.
+    The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. It also provides
+    rankings of the labels that did not "win". The ``SklearnIntentClassifier`` needs to be preceded by a dense
+    featurizer in the pipeline. This dense featurizer creates the features used for the classification.
 
 :Configuration:
     During the training of the SVM a hyperparameter search is run to
@@ -940,6 +960,121 @@ ResponseSelector
     Response Selector component can be used to build a response retrieval model to directly predict a bot response from
     a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
     It embeds user inputs and response labels into the same space and follows the exact same
+    neural network architecture and optimization as the ``EmbeddingIntentClassifier``.
+
+    .. note:: If during prediction time a message contains **only** words unseen during training,
+              and no Out-Of-Vacabulary preprocessor was used,
+              empty response ``None`` is predicted with confidence ``0.0``.
+
+    .. warning::
+        ``ResponseSelector`` is deprecated and should be replaced by ``DIETSelector``. See
+        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+
+:Configuration:
+
+    The algorithm includes all the hyperparameters that ``EmbeddingIntentClassifier`` uses.
+    In addition, the component can also be configured to train a response selector for a particular retrieval intent.
+
+        - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
+
+    Default values:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "ResponseSelector"
+            # nn architecture
+            # sizes of hidden layers before the embedding layer
+            # for input words and intent labels,
+            # the number of hidden layers is thus equal to the length of this list
+            "hidden_layers_sizes": {"text": [], "label": []}
+            # Whether to share the hidden layer weights between input words and labels
+            "share_hidden_layers": False
+            # training parameters
+            # initial and final batch sizes - batch size will be
+            # linearly increased for each epoch
+            "batch_size": [64, 256]
+            # how to create batches
+            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
+            # number of epochs
+            "epochs": 300
+            # set random seed to any int to get reproducible results
+            "random_seed": None
+            # optimizer
+            "learning_rate": 0.001
+            # embedding parameters
+            # default dense dimension used if no dense features are present
+            "dense_dimension": {"text": 512, "label": 512}
+            # dimension size of embedding vectors
+            "embedding_dimension": 20
+            # the type of the similarity
+            "number_of_negative_examples": 20
+            # flag if minimize only maximum similarity over incorrect actions
+            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
+            # the type of the loss function
+            "loss_type": "softmax"  # string 'softmax' or 'margin'
+            # number of top intents to normalize scores for softmax loss_type
+            # set to 0 to turn off normalization
+            "ranking_length": 10
+            # how similar the algorithm should try
+            # to make embedding vectors for correct labels
+            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
+            # maximum negative similarity for incorrect labels
+            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
+            # flag: if true, only minimize the maximum similarity for incorrect labels
+            "use_maximum_negative_similarity": True
+            # scale loss inverse proportionally to confidence of correct prediction
+            "scale_loss": True
+            # regularization parameters
+            # the scale of regularization
+            "regularization_constant": 0.002
+            # the scale of how critical the algorithm should be of minimizing the
+            # maximum similarity between embeddings of different labels
+            "negative_margin_scale": 0.8
+            # dropout rate for rnn
+            "droprate": 0.2
+            # if true apply dropout to sparse tensors
+            "use_sparse_input_dropout": True
+            # visualization of accuracy
+            # how often to calculate training accuracy
+            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
+            # how many examples to use for calculation of training accuracy
+            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+            # selector config
+            # name of the intent for which this response selector is to be trained
+            "retrieval_intent": None
+
+
+.. _diet-selector:
+
+DIETSelector
+~~~~~~~~~~~~~~~~
+
+:Short: DIET Selector
+:Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and response
+
+:Output-Example:
+
+    .. code-block:: json
+
+        {
+            "response_selector": {
+              "faq": {
+                "response": {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
+                "ranking": [
+                    {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
+                    {"confidence": 0.2134543431, "name": "You can ask me about how to get started"}
+                ]
+              }
+            }
+        }
+
+:Description:
+
+    DIET Selector component can be used to build a response retrieval model to directly predict a bot response from
+    a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
+    It embeds user inputs and response labels into the same space and follows the exact same
     neural network architecture and optimization as the ``DIETClassifier``.
 
     .. note:: If during prediction time a message contains **only** words unseen during training,
@@ -1041,29 +1176,6 @@ ResponseSelector
             # name of the intent for which this response selector is to be trained
             "retrieval_intent": None
 
-.. _LanguageModelTokenizer:
-
-LanguageModelTokenizer
-~~~~~~~~~~~~~~~~~~~~~~
-
-:Short: Tokenizer from pre-trained language models
-:Outputs: nothing
-:Requires: :ref:`HFTransformersNLP`
-:Description:
-    Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component.
-    Must be used whenever the ``LanguageModelFeaturizer`` is used.
-:Configuration:
-
-    Include ``HFTransformersNLP`` component upstream.
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "HFTransformersNLP"
-          model_name: # name of language model to use
-        - name: "LanguageModelTokenizer"
-
-
 
 Entity Extractors
 -----------------
@@ -1407,7 +1519,7 @@ Combined Entity Extractors and Intent Classifiers
 DIETClassifier
 ~~~~~~~~~~~~~~
 
-:Short: Dual Intent Entity Transformer used for intent classification and entity extraction
+:Short: Dual Intent Entity Transformer (DIET) used for intent classification and entity extraction
 :Outputs: ``entities``, ``intent`` and ``intent_ranking``
 :Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
 :Output-Example:
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index fa3dab4b6f48..a6e691a7c8cb 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -38,14 +38,14 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    USE_MAX_SIM_NEG,
-    MU_NEG,
-    MU_POS,
     EMBED_DIM,
     BILOU_FLAG,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
+    USE_MAX_NEG_SIM,
+    MAX_NEG_SIM,
+    MAX_POS_SIM,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -133,12 +133,12 @@ class DIETSelector(DIETClassifier):
         RANKING_LENGTH: 10,
         # how similar the algorithm should try
         # to make embedding vectors for correct intent labels
-        MU_POS: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
+        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
         # maximum negative similarity for incorrect intent labels
-        MU_NEG: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
+        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
         # flag: if true, only minimize the maximum similarity for
         # incorrect intent labels
-        USE_MAX_SIM_NEG: True,
+        USE_MAX_NEG_SIM: True,
         # scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
         # regularization parameters
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 9b13bc84896d..7e1be429431d 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -139,9 +139,6 @@ class ResponseSelector(DIETClassifier):
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
-        # if true random tokens of the input message will be masked and the model
-        # should predict those tokens
-        MASKED_LM: False,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,

From 17b8476d6fed7308cc2dd4401f04fbafaf232650 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 10:05:20 +0100
Subject: [PATCH 433/633] update scope of config in conftest

---
 rasa/nlu/classifiers/sklearn_intent_classifier.py | 2 +-
 tests/nlu/conftest.py                             | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 1c85a678ceee..cfc976d0966a 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -145,7 +145,7 @@ def _create_classifier(
         # dirty str fix because sklearn is expecting
         # str not instance of basestr...
         tuned_parameters = [
-            {"C": np.array(C), "gamma": gamma, "kernel": [str(k) for k in kernels]}
+            {"C": C, "gamma": gamma, "kernel": [str(k) for k in kernels]}
         ]
 
         # aim for 5 examples in each fold
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index d0de170f26f4..9c6205f1a479 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -37,7 +37,7 @@ def mitie_feature_extractor(
     ).extractor
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def default_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig({"language": "en", "pipeline": []})
 
@@ -57,7 +57,7 @@ def config_path() -> Text:
     ).name
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig(
         {
@@ -75,7 +75,7 @@ def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig:
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def supervised_embeddings_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig(
         {
@@ -98,7 +98,7 @@ def supervised_embeddings_config() -> RasaNLUModelConfig:
     )
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture()
 def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig(
         {

From c8271a1a996957ca85db57b0bab373b3462a3e10 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 10:40:29 +0100
Subject: [PATCH 434/633] set batch_strategy on crf entity extractor

---
 rasa/nlu/extractors/crf_entity_extractor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index d055ebcb4baf..6c500f9ed61f 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -29,6 +29,7 @@
     DROPRATE,
     REGULARIZATION_CONSTANT,
     BILOU_FLAG,
+    BATCH_STRATEGY,
 )
 from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.models import RasaModel
@@ -118,6 +119,7 @@ def __init__(
         component_config[ENTITY_RECOGNITION] = True
         component_config[MASKED_LM] = False
         component_config[NUM_TRANSFORMER_LAYERS] = 0
+        component_config[BATCH_STRATEGY] = "sequence"
 
         super().__init__(
             component_config,

From 45a639f9b61d891b5c3f2e9a07ec593aec3ce41f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 10:56:13 +0100
Subject: [PATCH 435/633] don't check if tokenizer is present

---
 rasa/nlu/components.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 10eaf5bcc4a5..1ca126320db6 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -59,11 +59,7 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
         if isinstance(component, Tokenizer):
             tokenizer_names.append(component.name)
 
-    if not tokenizer_names:
-        raise Exception(
-            f"No tokenizer is used. You should add one tokenizer to your pipeline."
-        )
-    elif len(tokenizer_names) > 1:
+    if len(tokenizer_names) > 1:
         raise Exception(
             f"More then one tokenizer is used: {tokenizer_names}. "
             f"You can use only one tokenizer."

From 26133f56d5d468c85ef74b76917ca95d1cf7cd08 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 10:56:58 +0100
Subject: [PATCH 436/633] add point

---
 rasa/nlu/components.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 1ca126320db6..9e1795ed1064 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -73,7 +73,7 @@ def validate_required_components(pipeline: List["Component"]) -> None:
         if not set(component.required_components).issubset(unique_component_names):
             raise Exception(
                 f"'{component.name}' requires {component.required_components}. "
-                f"Add required components to the pipeline"
+                f"Add required components to the pipeline."
             )
 
 

From 4553dadad8f0aaa57968a5ab5c3874d28d2dcda0 Mon Sep 17 00:00:00 2001
From: Tanja <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 10:58:31 +0100
Subject: [PATCH 437/633] Update rasa/core/policies/keras_policy.py

Co-Authored-By: Daksh Varshneya <d.varshneya@rasa.com>
---
 rasa/core/policies/keras_policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 5d95b799e0af..25a786b8f425 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -75,7 +75,7 @@ def __init__(
             "'KerasPolicy' is deprecated and will be removed in version "
             "2.0. Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs=f"{DOCS_BASE_URL}/core/policies/",
+            docs=f"{DOCS_URL_POLICIES}",
         )
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:

From c4dc95e92de304ae9d02b21c975107d448d51a17 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:01:20 +0100
Subject: [PATCH 438/633] fix random seed

---
 rasa/core/policies/embedding_policy.py  |  4 ++--
 rasa/nlu/selectors/response_selector.py |  4 ++--
 tests/nlu/conftest.py                   | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 84bbd8572459..b4b4da1328c5 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -4,7 +4,7 @@
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.ted_policy import TEDPolicy
-from rasa.constants import DOCS_BASE_URL
+from rasa.constants import DOCS_URL_POLICIES
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -135,5 +135,5 @@ def __init__(
             f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. "
             f"Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs=f"{DOCS_BASE_URL}/core/policies/",
+            docs=DOCS_URL_POLICIES,
         )
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 88c5f1cd5bf1..bfa04a1c1d8e 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -47,7 +47,7 @@
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
 from rasa.utils.common import raise_warning
-from rasa.constants import DOCS_BASE_URL
+from rasa.constants import DOCS_URL_COMPONENTS
 
 logger = logging.getLogger(__name__)
 
@@ -178,7 +178,7 @@ def __init__(
             f"'ResponseSelector' is deprecated and will be removed in version 2.0. "
             f"Use 'DIETSelector' instead.",
             category=FutureWarning,
-            docs=f"{DOCS_BASE_URL}/nlu/components/",
+            docs=DOCS_URL_COMPONENTS,
         )
 
     @property
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 9c6205f1a479..907be110241b 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -4,7 +4,7 @@
 
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.components import ComponentBuilder
-from rasa.utils.tensorflow.constants import EPOCHS
+from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED
 from tests.nlu.utilities import write_file_config
 
 DEFAULT_DATA_PATH = "data/examples/rasa/demo-rasa.json"
@@ -37,7 +37,7 @@ def mitie_feature_extractor(
     ).extractor
 
 
-@pytest.fixture()
+@pytest.fixture(scope="session")
 def default_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig({"language": "en", "pipeline": []})
 
@@ -49,9 +49,9 @@ def config_path() -> Text:
             "language": "en",
             "pipeline": [
                 {"name": "WhitespaceTokenizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 2},
+                {"name": "CRFEntityExtractor", EPOCHS: 2, RANDOM_SEED: 42},
                 {"name": "CountVectorsFeaturizer"},
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 2},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 2, RANDOM_SEED: 42},
             ],
         }
     ).name
@@ -67,7 +67,7 @@ def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig:
                 {"name": "SpacyTokenizer"},
                 {"name": "SpacyFeaturizer"},
                 {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 3},
+                {"name": "CRFEntityExtractor", EPOCHS: 3, RANDOM_SEED: 42},
                 {"name": "EntitySynonymMapper"},
                 {"name": "SklearnIntentClassifier"},
             ],
@@ -83,7 +83,7 @@ def supervised_embeddings_config() -> RasaNLUModelConfig:
             "pipeline": [
                 {"name": "WhitespaceTokenizer"},
                 {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 3},
+                {"name": "CRFEntityExtractor", EPOCHS: 3, RANDOM_SEED: 42},
                 {"name": "EntitySynonymMapper"},
                 {"name": "CountVectorsFeaturizer"},
                 {
@@ -92,7 +92,7 @@ def supervised_embeddings_config() -> RasaNLUModelConfig:
                     "min_ngram": 1,
                     "max_ngram": 4,
                 },
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 3},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 3, RANDOM_SEED: 42},
             ],
         }
     )

From 6f5c112a6482c2ac39fdf9fe8433e798b9691ec0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:06:49 +0100
Subject: [PATCH 439/633] update docerfiles with new pipelines

---
 docker/Dockerfile_full                          |  2 +-
 ...Dockerfile_pretrained_embeddings_convert_en} | 12 +++---------
 .../Dockerfile_pretrained_embeddings_spacy_en   |  2 +-
 .../config_pretrained_embeddings_convert.yml    | 15 +++++++++++++++
 .../config_pretrained_embeddings_mitie.yml      | 11 -----------
 .../config_pretrained_embeddings_spacy.yml      |  3 ---
 .../config_pretrained_embeddings_spacy_de.yml   | 15 ++++++++++++++-
 ...fig_pretrained_embeddings_spacy_duckling.yml | 13 -------------
 .../config_pretrained_embeddings_spacy_en.yml   | 16 ++++++++++++++++
 .../config_supervised_embeddings_duckling.yml   | 17 +++++++++++++++++
 10 files changed, 67 insertions(+), 39 deletions(-)
 rename docker/{Dockerfile_pretrained_embeddings_mitie_en => Dockerfile_pretrained_embeddings_convert_en} (81%)
 create mode 100644 docker/configs/config_pretrained_embeddings_convert.yml
 delete mode 100644 docker/configs/config_pretrained_embeddings_mitie.yml
 delete mode 100644 docker/configs/config_pretrained_embeddings_spacy.yml
 delete mode 100644 docker/configs/config_pretrained_embeddings_spacy_duckling.yml
 create mode 100644 docker/configs/config_pretrained_embeddings_spacy_en.yml
 create mode 100644 docker/configs/config_supervised_embeddings_duckling.yml

diff --git a/docker/Dockerfile_full b/docker/Dockerfile_full
index e75d4b6c12a0..c697018d783f 100644
--- a/docker/Dockerfile_full
+++ b/docker/Dockerfile_full
@@ -72,7 +72,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY docker/configs/config_pretrained_embeddings_spacy_duckling.yml config.yml
+COPY docker/configs/config_supervised_embeddings_duckling.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_convert_en
similarity index 81%
rename from docker/Dockerfile_pretrained_embeddings_mitie_en
rename to docker/Dockerfile_pretrained_embeddings_convert_en
index 663986b7faf1..6d58722df569 100644
--- a/docker/Dockerfile_pretrained_embeddings_mitie_en
+++ b/docker/Dockerfile_pretrained_embeddings_convert_en
@@ -41,9 +41,6 @@ RUN apt-get update -qq \
 # Make sure we have the latest pip version
 RUN pip install -U pip
 
-# Download mitie model
-RUN wget -P /app/data/ https://s3-eu-west-1.amazonaws.com/mitie/total_word_feature_extractor.dat
-
 # Copy only what we really need
 COPY README.md .
 COPY setup.py .
@@ -54,11 +51,11 @@ COPY requirements.txt .
 COPY LICENSE.txt .
 
 # Install dependencies
-RUN pip install --no-cache-dir -r alt_requirements/requirements_pretrained_embeddings_mitie.txt
+RUN pip install --no-cache-dir -r alt_requirements/requirements_pretrained_embeddings_convert.txt
 
 # Install Rasa as package
 COPY rasa ./rasa
-RUN pip install .[sql,mitie]
+RUN pip install .[sql,convert]
 
 # Runtime stage which uses the virtualenv which we built in the previous stage
 FROM base AS runner
@@ -66,10 +63,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY docker/configs/config_pretrained_embeddings_mitie.yml config.yml
-
-# Copy over mitie model
-COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
+COPY docker/configs/config_pretrained_embeddings_convert.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_en b/docker/Dockerfile_pretrained_embeddings_spacy_en
index 1f54d2b72ed4..6796e1af18ed 100644
--- a/docker/Dockerfile_pretrained_embeddings_spacy_en
+++ b/docker/Dockerfile_pretrained_embeddings_spacy_en
@@ -67,7 +67,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY docker/configs/config_pretrained_embeddings_spacy.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_spacy_en.yml config.yml
 
 # Copy virtualenv from previous stage
 COPY --from=builder /build /build
diff --git a/docker/configs/config_pretrained_embeddings_convert.yml b/docker/configs/config_pretrained_embeddings_convert.yml
new file mode 100644
index 000000000000..c059db91d4db
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_convert.yml
@@ -0,0 +1,15 @@
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
diff --git a/docker/configs/config_pretrained_embeddings_mitie.yml b/docker/configs/config_pretrained_embeddings_mitie.yml
deleted file mode 100644
index d1b8b86dd953..000000000000
--- a/docker/configs/config_pretrained_embeddings_mitie.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-language: "en"
-
-pipeline:
-- name: "MitieNLP"
-  model: "data/total_word_feature_extractor.dat"
-- name: "MitieTokenizer"
-- name: "MitieEntityExtractor"
-- name: "EntitySynonymMapper"
-- name: "RegexFeaturizer"
-- name: "MitieFeaturizer"
-- name: "SklearnIntentClassifier"
diff --git a/docker/configs/config_pretrained_embeddings_spacy.yml b/docker/configs/config_pretrained_embeddings_spacy.yml
deleted file mode 100644
index 3516519cd529..000000000000
--- a/docker/configs/config_pretrained_embeddings_spacy.yml
+++ /dev/null
@@ -1,3 +0,0 @@
-language: "en"
-
-pipeline: "pretrained_embeddings_spacy"
diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml
index 7345028fab2e..e4ab976a258d 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_de.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml
@@ -1,3 +1,16 @@
 language: "de"
 
-pipeline: "pretrained_embeddings_spacy"
+pipeline:
+  - name: SpacyNLP
+  - name: SpacyTokenizer
+  - name: SpacyFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
\ No newline at end of file
diff --git a/docker/configs/config_pretrained_embeddings_spacy_duckling.yml b/docker/configs/config_pretrained_embeddings_spacy_duckling.yml
deleted file mode 100644
index 62fce290bef6..000000000000
--- a/docker/configs/config_pretrained_embeddings_spacy_duckling.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-language: "en"
-
-pipeline:
-- name: "SpacyNLP"
-- name: "SpacyTokenizer"
-- name: "RegexFeaturizer"
-- name: "SpacyFeaturizer"
-- name: "CRFEntityExtractor"
-- name: "EntitySynonymMapper"
-- name: "SklearnIntentClassifier"
-- name: "DucklingHTTPExtractor"
-  url: "http://duckling:8000"
-  
\ No newline at end of file
diff --git a/docker/configs/config_pretrained_embeddings_spacy_en.yml b/docker/configs/config_pretrained_embeddings_spacy_en.yml
new file mode 100644
index 000000000000..244b5dc1a9a4
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_spacy_en.yml
@@ -0,0 +1,16 @@
+language: "en"
+
+pipeline:
+  - name: SpacyNLP
+  - name: SpacyTokenizer
+  - name: SpacyFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
\ No newline at end of file
diff --git a/docker/configs/config_supervised_embeddings_duckling.yml b/docker/configs/config_supervised_embeddings_duckling.yml
new file mode 100644
index 000000000000..3bb09d39765d
--- /dev/null
+++ b/docker/configs/config_supervised_embeddings_duckling.yml
@@ -0,0 +1,17 @@
+language: "en"
+
+pipeline:
+  - name: WhitespaceTokenizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
+  - name: DucklingHTTPExtractor
+    url: "http://duckling:8000"
+  
\ No newline at end of file

From 9b18010ff9e07a022d8a1bf3ccb43d16cdb87005 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:13:45 +0100
Subject: [PATCH 440/633] change default of lexical syntactiv featurizer

---
 .../lexical_syntactic_featurizer.py                | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 4a003c6747fb..2b71bd6a2459 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -32,19 +32,7 @@ class LexicalSyntacticFeaturizer(Featurizer):
         # POS features require 'SpacyTokenizer'.
         "features": [
             ["low", "title", "upper"],
-            [
-                "BOS",
-                "EOS",
-                "low",
-                "prefix5",
-                "prefix2",
-                "suffix5",
-                "suffix3",
-                "suffix2",
-                "upper",
-                "title",
-                "digit",
-            ],
+            ["BOS", "EOS", "low", "upper", "title", "digit"],
             ["low", "title", "upper"],
         ]
     }

From 2815c61e902c19748ab0230aca28344ae5855601 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:34:44 +0100
Subject: [PATCH 441/633] update migration guide

---
 docs/migration-guide.rst | 33 +++++++++++++++++++++++++++------
 docs/nlu/components.rst  |  8 +++++++-
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 967cbc3481e9..0cbd42311df4 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -36,6 +36,9 @@ General
   ``epochs``. ``max_history`` is particularly important and strongly depends on your stories.
   Please see the docs of the :ref:`embedding_policy` if you want to customize them.
 
+- All pre-defined pipeline templates are deprecated. Take a look at :ref:`choosing-a-pipeline`
+  to decide on what components you should use in your configuration file.
+
 - The :ref:`embedding_policy` got renamed to :ref:`ted_policy`. The functionality of the policy stayed the same.
   Please update your configuration files to use ``TEDPolicy`` instead of ``EmbeddingPolicy``.
 
@@ -66,13 +69,13 @@ General
   evaluate_on_num_examples       evaluate_on_number_of_examples
   =============================  =======================================================
 
-  A warning will be logged in case an old option is used.
+  A warning will be logged in case an old option is used. However, you can still use the old configuration options.
+  They will be mapped to the new names.
 
 - ``EmbeddingIntentClassifier`` is now deprecated and will be replaced by ``DIETClassifier`` in the future.
-  ``DIETClassifier`` builds on top the model architecture of the ``EmbeddingIntentClassifier``. ``DIETClassfier``
-  allows you to train one model for entity extraction and intent classification. However, if you want to
-  get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use the following configuration of
-  ``DIETClassifier``:
+  ``DIETClassfier`` is based on a multi-task architecture for intent classification and entity recognition.
+  However, if you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use
+  the following configuration of ``DIETClassifier``:
 
   .. code-block:: yaml
 
@@ -89,7 +92,7 @@ General
   See :ref:`diet-classifier` for more information about the new component.
 
 - ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. ``DIETClassfier``
-  allows you to train one model for entity extraction and intent classification. However, if you want to
+  is based on a multi-task architecture for intent classification and entity recognition. However, if you want to
   get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration:
 
   .. code-block:: yaml
@@ -128,6 +131,24 @@ General
   ``DIETClassifier``. For more information about the ``DIETClassifier`` and the ``LexicalSyntacticFeaturizer``
   see :ref:`components`.
 
+- ``ResponseSelector`` is now deprecated and will be replaced by ``DIETSelector`` in the future. If you want to
+  get the same model behaviour as the current ``ResponseSelector``, you can use the following configuration of
+  ``DIETSelector``:
+
+  .. code-block:: yaml
+
+    pipeline:
+    - ... # other components
+    - name: DIETSelector
+      intent_classification: True
+      entity_recognition: False
+      use_masked_language_model: False
+      BILOU_flag: False
+      number_of_transformer_layers: 0
+      ... # any other parameters
+
+  See :ref:`diet-selector` for more information about the new component.
+
 .. _migration-to-rasa-1.7:
 
 Rasa 1.6 to Rasa 1.7
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 650f34890349..bc57f38e3143 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1549,7 +1549,13 @@ DIETClassifier
         }
 
 :Description:
-    TODO
+    DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity
+    recognition. The architecture is based on a transformer which is shared for both tasks.
+    A sequence of entity labels is predicted through a Conditional Random Field (CRF) tagging layer on top of the
+    transformer output sequence corresponding to the input sequence of tokens.
+    The transformer output for the ``__CLS__`` token and intent labels are embedded into a single semantic vector
+    space. We use the dot-product loss to maximize the similarity with the target label and minimize
+    similarities with negative samples.
 
     .. note:: If during prediction time a message contains **only** words unseen during training
               and no Out-Of-Vacabulary preprocessor was used,

From b33a6a7610e10dabc65bdd44fdff4cccf73e703c Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Wed, 19 Feb 2020 11:36:25 +0100
Subject: [PATCH 442/633] Update rasa/nlu/components.py

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 rasa/nlu/components.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 9e1795ed1064..5e5b67127227 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -251,7 +251,8 @@ def name(self):
     # provided properties from the previous components.
     requires = []
 
-    # Which components are required by this component
+    # Which components are required by this component. Listed components should appear before 
+    # the component itself in the pipeline.
     required_components = []
 
     # Defines the default configuration parameters of a component

From df2b407e09550bdae611308c7c1194099a94424b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 11:38:22 +0100
Subject: [PATCH 443/633] black

---
 rasa/nlu/components.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 5e5b67127227..b26cb7c2cad4 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -251,8 +251,8 @@ def name(self):
     # provided properties from the previous components.
     requires = []
 
-    # Which components are required by this component. Listed components should appear before 
-    # the component itself in the pipeline.
+    # Which components are required by this component.
+    # Listed components should appear before the component itself in the pipeline.
     required_components = []
 
     # Defines the default configuration parameters of a component

From 6a3533cac5e5f54246b377a14c10546f626642e1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:39:39 +0100
Subject: [PATCH 444/633] add missing random seed

---
 rasa/core/policies/keras_policy.py | 4 ++--
 tests/nlu/conftest.py              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 25a786b8f425..20ba85885db6 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -14,7 +14,7 @@
     MaxHistoryTrackerFeaturizer,
     BinarySingleStateFeaturizer,
 )
-from rasa.constants import DOCS_BASE_URL
+from rasa.constants import DOCS_URL_POLICIES
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.policy import Policy
 from rasa.core.trackers import DialogueStateTracker
@@ -75,7 +75,7 @@ def __init__(
             "'KerasPolicy' is deprecated and will be removed in version "
             "2.0. Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs=f"{DOCS_URL_POLICIES}",
+            docs=DOCS_URL_POLICIES,
         )
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 907be110241b..b19320e18717 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -106,7 +106,7 @@ def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
             "pipeline": [
                 {"name": "ConveRTTokenizer"},
                 {"name": "ConveRTFeaturizer"},
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 3},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 3, RANDOM_SEED: 42},
             ],
         }
     )

From 73337079ebaef2e154e701c78c653a05be77959c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:47:47 +0100
Subject: [PATCH 445/633] review comments

---
 .../nlu/extractors/test_crf_entity_extractor.py  |  1 +
 tests/nlu/extractors/test_entity_synonyms.py     | 16 +++++++++++++++-
 tests/nlu/extractors/test_synonyms.py            | 15 ---------------
 3 files changed, 16 insertions(+), 16 deletions(-)
 delete mode 100644 tests/nlu/extractors/test_synonyms.py

diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index 89624008f4d9..f581ae36536d 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -50,6 +50,7 @@ def test_crf_extractor(spacy_nlp):
                 ["low", "title", "upper", "pos", "pos2"],
             ],
             RANDOM_SEED: 1,
+            EPOCHS: 75,
         }
     )
     tokenizer = WhitespaceTokenizer()
diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py
index 38df9fc9b81f..3d5230166b79 100644
--- a/tests/nlu/extractors/test_entity_synonyms.py
+++ b/tests/nlu/extractors/test_entity_synonyms.py
@@ -1,5 +1,19 @@
+from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
 from rasa.nlu.training_data import TrainingData, Message
-from tests.nlu import utilities
+
+
+def test_entity_synonyms():
+    entities = [
+        {"entity": "test", "value": "chines", "start": 0, "end": 6},
+        {"entity": "test", "value": "chinese", "start": 0, "end": 6},
+        {"entity": "test", "value": "china", "start": 0, "end": 6},
+    ]
+    ent_synonyms = {"chines": "chinese", "NYC": "New York City"}
+    EntitySynonymMapper(synonyms=ent_synonyms).replace_synonyms(entities)
+    assert len(entities) == 3
+    assert entities[0]["value"] == "chinese"
+    assert entities[1]["value"] == "chinese"
+    assert entities[2]["value"] == "china"
 
 
 def test_unintentional_synonyms_capitalized(
diff --git a/tests/nlu/extractors/test_synonyms.py b/tests/nlu/extractors/test_synonyms.py
deleted file mode 100644
index 8bb48e62a442..000000000000
--- a/tests/nlu/extractors/test_synonyms.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper
-
-
-def test_entity_synonyms():
-    entities = [
-        {"entity": "test", "value": "chines", "start": 0, "end": 6},
-        {"entity": "test", "value": "chinese", "start": 0, "end": 6},
-        {"entity": "test", "value": "china", "start": 0, "end": 6},
-    ]
-    ent_synonyms = {"chines": "chinese", "NYC": "New York City"}
-    EntitySynonymMapper(synonyms=ent_synonyms).replace_synonyms(entities)
-    assert len(entities) == 3
-    assert entities[0]["value"] == "chinese"
-    assert entities[1]["value"] == "chinese"
-    assert entities[2]["value"] == "china"

From 30304f247b473b357c02173acffd21b6a332d5df Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 11:52:14 +0100
Subject: [PATCH 446/633] remove not needed config

---
 tests/nlu/test_evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index 55829972f019..f8ac3176d115 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -302,7 +302,7 @@ def test_run_cv_evaluation(pretrained_embeddings_spacy_config):
     assert len(entity_results.test["CRFEntityExtractor"]["F1-score"]) == n_folds
 
 
-def test_run_cv_evaluation_with_response_selector(supervised_embeddings_config):
+def test_run_cv_evaluation_with_response_selector():
     training_data_obj = training_data.load_data("data/examples/rasa/demo-rasa.md")
     training_data_responses_obj = training_data.load_data(
         "data/examples/rasa/demo-rasa-responses.md"

From bed3a5642489bc4651b4d189b0fc41303e48b2fb Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 13:22:13 +0100
Subject: [PATCH 447/633] add docstrings

---
 rasa/nlu/components.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index b26cb7c2cad4..1fa68425daf7 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -28,7 +28,7 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
 
 
 def validate_requirements(component_names: List[Text]) -> None:
-    """Ensures that all required importable python packages are installed."""
+    """Validates that all required importable python packages are installed."""
 
     from rasa.nlu import registry
 
@@ -52,6 +52,8 @@ def validate_requirements(component_names: List[Text]) -> None:
 
 
 def validate_tokenizers(pipeline: List["Component"]) -> None:
+    """Validates that only one tokenizer is present in the pipeline."""
+
     from rasa.nlu.tokenizers.tokenizer import Tokenizer
 
     tokenizer_names = []
@@ -67,6 +69,8 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
 
 
 def validate_required_components(pipeline: List["Component"]) -> None:
+    """Validates that all required components are present in the pipeline."""
+
     unique_component_names = set()
     for component in pipeline:
         unique_component_names.add(component.name)
@@ -82,10 +86,7 @@ def validate_arguments(
     context: Dict[Text, Any],
     allow_empty_pipeline: bool = False,
 ) -> None:
-    """Validates a pipeline before it is run.
-
-    Ensures, that all arguments are present to train the pipeline.
-    """
+    """Validates that all arguments are present to train the pipeline."""
 
     # Ensure the pipeline is not empty
     if not allow_empty_pipeline and len(pipeline) == 0:
@@ -143,7 +144,8 @@ def validate_requires_any_of(
 
 def validate_required_components_from_data(
     pipeline: List["Component"], data: TrainingData
-):
+) -> None:
+    """Validates that all components are present in the pipeline based on data."""
 
     response_selector_exists = False
     for component in pipeline:

From 8071919623971c7bfe3d7c1b2b19e01f61a356fb Mon Sep 17 00:00:00 2001
From: Evgeniia Razumovskaia <evgeniar@yahoo.com>
Date: Wed, 19 Feb 2020 14:04:08 +0100
Subject: [PATCH 448/633] pbar printing mistake

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index e2bc527302b8..e1b19332b230 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -221,6 +221,7 @@ def train(
                     )
 
                 batch_start_index += batch_size
+        print('\n')
 
     def process(self, message: Message, **kwargs: Any) -> None:
 

From e3a3932c21d678e5aa394cb19621104ec4e87788 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 13:40:14 +0100
Subject: [PATCH 449/633] use tedpolicy as default

---
 changelog/4817.improvement.rst      | 4 +++-
 changelog/699.misc.rst              | 4 ++--
 docs/migration-guide.rst            | 8 ++++----
 docs/nlu/choosing-a-pipeline.rst    | 8 +++++---
 rasa/cli/initial_project/config.yml | 2 +-
 5 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/changelog/4817.improvement.rst b/changelog/4817.improvement.rst
index 71fb53e94139..7c310bc073e1 100644
--- a/changelog/4817.improvement.rst
+++ b/changelog/4817.improvement.rst
@@ -1,2 +1,4 @@
 Part of Slack sanitization: 
-Multiple garbled URL's in a string coming from slack will be converted into actual strings. ``Example: health check of <http://eemdb.net|eemdb.net> and <http://eemdb1.net|eemdb1.net> to health check of eemdb.net and eemdb1.net``
+Multiple garbled URL's in a string coming from slack will be converted into actual strings.
+``Example: health check of <http://eemdb.net|eemdb.net> and <http://eemdb1.net|eemdb1.net> to health check of
+eemdb.net and eemdb1.net``
diff --git a/changelog/699.misc.rst b/changelog/699.misc.rst
index 5677d8bc05aa..2fe612da3afd 100644
--- a/changelog/699.misc.rst
+++ b/changelog/699.misc.rst
@@ -1,6 +1,6 @@
-The `EmbeddingPolicy <https://rasa.com/docs/rasa/core/policies/#embedding-policy>`_
+The `TEDPolicy <https://rasa.com/docs/rasa/core/policies/#ted-policy>`_
 replaces the ``KerasPolicy`` in new Rasa projects generated with ``rasa init``.
-The `EmbeddingPolicy <https://rasa.com/docs/rasa/core/policies/#embedding-policy>`_
+The `TEDPolicy <https://rasa.com/docs/rasa/core/policies/#ted-policy>`_
 is now the recommended machine learning policy. Please see the
 `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_
 if you want to switch to this new policy in an existing project.
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 0cbd42311df4..ea02ded348a5 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -18,23 +18,23 @@ Rasa 1.7 to Rasa 1.8
 
 General
 ~~~~~~~
-- The :ref:`embedding_policy` replaced the :ref:`keras_policy` as recommended machine
+- The :ref:`ted_policy` replaced the :ref:`keras_policy` as recommended machine
   learning policy. New projects generated with ``rasa init`` will automatically use
   this policy. In case you want to change your existing model configuration to use the
-  :ref:`embedding_policy` add this to the ``policies`` section in your ``config.yml``
+  :ref:`ted_policy` add this to the ``policies`` section in your ``config.yml``
   and remove potentially existing ``KerasPolicy`` entries:
 
   .. code-block:: yaml
 
     policies:
     - ... # other policies
-    - name: EmbeddingPolicy
+    - name: TEDPolicy
       max_history: 5
       epochs: 100
 
   The given snippet specifies default values for the parameters ``max_history`` and
   ``epochs``. ``max_history`` is particularly important and strongly depends on your stories.
-  Please see the docs of the :ref:`embedding_policy` if you want to customize them.
+  Please see the docs of the :ref:`ted_policy` if you want to customize them.
 
 - All pre-defined pipeline templates are deprecated. Take a look at :ref:`choosing-a-pipeline`
   to decide on what components you should use in your configuration file.
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index f51081b002c4..2082e44ac05f 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -1,6 +1,4 @@
-:desc: Set up a pipeline of pre-trained word vectors form GloVe or fastText
-       or fit them specifically on your dataset using the TensorFlow pipeline
-       for open source NLU.
+:desc: Set up a pipeline of pre-trained components.
 
 .. _choosing-a-pipeline:
 
@@ -15,6 +13,10 @@ it on your dataset.
 .. contents::
    :local:
 
+.. note::
+    We deprecated all existing pipeline templates, e.g. ``supervised_embeddings``, ``pretrained_embeddings_convert``
+    and ``pretrained_embeddings_spacy``. Please, list any components you want to use directly in the configuration
+    file.
 
 The Short Answer
 ----------------
diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml
index 8e95024ee7af..f1088d9a442a 100644
--- a/rasa/cli/initial_project/config.yml
+++ b/rasa/cli/initial_project/config.yml
@@ -18,7 +18,7 @@ pipeline:
 # https://rasa.com/docs/rasa/core/policies/
 policies:
   - name: MemoizationPolicy
-  - name: EmbeddingPolicy
+  - name: TEDPolicy
     max_history: 5
     epochs: 100
   - name: MappingPolicy

From 19b3659cc725ff421d90d39540851f88e5e86aa6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 14:51:36 +0100
Subject: [PATCH 450/633] update choosing a pipeline

---
 docs/nlu/choosing-a-pipeline.rst | 400 ++++++++++++++++++++-----------
 1 file changed, 259 insertions(+), 141 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 2082e44ac05f..acd92025b882 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -13,7 +13,7 @@ it on your dataset.
 .. contents::
    :local:
 
-.. note::
+.. warning::
     We deprecated all existing pipeline templates, e.g. ``supervised_embeddings``, ``pretrained_embeddings_convert``
     and ``pretrained_embeddings_spacy``. Please, list any components you want to use directly in the configuration
     file.
@@ -21,72 +21,125 @@ it on your dataset.
 The Short Answer
 ----------------
 
-If your training data is in english, a good starting point is using ``pretrained_embeddings_convert`` pipeline.
-
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_convert.yml
-    :language: yaml
+If your training data is in english, a good starting point is the following pipeline:
 
-In case your training data is multi-lingual and is rich with domain specific vocabulary,
-use the ``supervised_embeddings`` pipeline:
+.. code-block:: yaml
 
-.. literalinclude:: ../../data/test_config/config_supervised_embeddings.yml
-    :language: yaml
+    language: "en"
 
+    pipeline:
+      - name: ConveRTTokenizer
+      - name: ConveRTFeaturizer
+      - name: RegexFeaturizer
+      - name: LexicalSyntacticFeaturizer
+      - name: CountVectorsFeaturizer
+      - name: CountVectorsFeaturizer
+        analyzer: "char_wb"
+        min_ngram: 1
+        max_ngram: 4
+      - name: DIETClassifier
+      - name: EntitySynonymMapper
+      - name: DIETSelector
 
-A Longer Answer
----------------
 
-The three most important pipelines are ``supervised_embeddings``, ``pretrained_embeddings_convert`` and ``pretrained_embeddings_spacy``.
-The ``pretrained_embeddings_spacy`` pipeline uses pre-trained
-word vectors from either GloVe or fastText, whereas ``pretrained_embeddings_convert`` uses a pretrained sentence encoding model `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ to
-extract vector representations of complete user utterance as a whole. On the other hand, the ``supervised_embeddings`` pipeline
-doesn't use any pre-trained word vectors or sentence vectors, but instead fits these specifically for your dataset.
+In case your training data is multi-lingual and is rich with domain specific vocabulary,
+use the following pipeline:
 
-.. note::
-    These recommendations are highly dependent on your dataset and hence approximate. We suggest experimenting with different pipelines to train the best model.
+.. code-block:: yaml
 
-pretrained_embeddings_spacy
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    language: "en"
 
-The advantage of ``pretrained_embeddings_spacy`` pipeline is that if you have a training example like:
-"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model
-already knows that the words "apples" and "pears" are very similar. This is especially useful
-if you don't have large enough training data.
+    pipeline:
+      - name: WhitespaceTokenizer
+      - name: RegexFeaturizer
+      - name: LexicalSyntacticFeaturizer
+      - name: CountVectorsFeaturizer
+      - name: CountVectorsFeaturizer
+        analyzer: "char_wb"
+        min_ngram: 1
+        max_ngram: 4
+      - name: DIETClassifier
+      - name: EntitySynonymMapper
+      - name: DIETSelector
 
 
-pretrained_embeddings_convert
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    .. warning::
-        Since ``ConveRT`` model is trained only on an **English** corpus of conversations, this pipeline should only be used if your training data is in English language.
+A Longer Answer
+---------------
 
+We encourage everyone to define their own pipeline by listing the names of the components you want to use.
+For example:
 
-This pipeline uses `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model to extract vector representation of a sentence and feeds them to ``EmbeddingIntentClassifier`` for intent classification.
-The advantage of using ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user message independently,
-but creates a contextual vector representation for the complete sentence. For example, if you have a training example, like:
-"can I book a car?", and Rasa is asked to predict the intent for "I need a ride from my place", since the contextual vector representation for both
-examples are already very similar, the intent classified for both is highly likely to be the same. This is also useful if you don't have
-large enough training data.
+.. code-block:: yaml
 
-    .. note::
-        To use ``pretrained_embeddings_convert`` pipeline, you should install Rasa with ``pip install rasa[convert]``. Please also note that one of the dependencies(``tensorflow-text``) is currently only supported on Linux platforms.
+    pipeline:
+      - name: WhitespaceTokenizer
+      - name: RegexFeaturizer
+      - name: LexicalSyntacticFeaturizer
+      - name: CountVectorsFeaturizer
+      - name: CountVectorsFeaturizer
+        analyzer: "char_wb"
+        min_ngram: 1
+        max_ngram: 4
+      - name: DIETClassifier
+      - name: EntitySynonymMapper
+      - name: DIETSelector
 
-supervised_embeddings
-~~~~~~~~~~~~~~~~~~~~~
+You can find the details of each component in :ref:`components`.
+If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
 
-The advantage of the ``supervised_embeddings`` pipeline is that your word vectors will be customised
-for your domain. For example, in general English, the word "balance" is closely related to "symmetry",
-but very different to the word "cash". In a banking domain, "balance" and "cash" are closely related
-and you'd like your model to capture that. This pipeline doesn't use a language-specific model,
-so it will work with any language that you can tokenize (on whitespace or using a custom tokenizer).
+A pipeline usually consist of three main parts:
 
-You can read more about this topic `here <https://medium.com/rasa-blog/supervised-word-vectors-from-scratch-in-rasa-nlu-6daf794efcd8>`__ .
+    1. Tokenizaion
+    2. Featuirzation
+    3. Entity Recognition / Intent Classification / Response Selectors
 
-MITIE
-~~~~~
+Tokenization
+~~~~~~~~~~~~
+If your chosen language is whitespace-tokenized (words are separated by spaces), you
+can use the ``WhitespaceTokenizer``. If this is not the case you should use a different tokenizer.
+We support a number of different :ref:`tokenizers <tokenizers>`, or you can :ref:`create your own <custom-nlu-components>`.
 
-You can also use MITIE as a source of word vectors in your pipeline, see :ref:`section_mitie_pipeline`. The MITIE backend performs well for small datasets, but training can take very long if you have more than a couple of hundred examples.
+.. note::
+    Some components further down the pipeline may require a specific tokenizer. You can find those requirements
+    on the individual components in :ref:`components`. If a required component is missing inside the pipeline, an
+    error will be thrown.
+
+Featurization
+~~~~~~~~~~~~~
+You need to decide whether to use components that provide pre-trained word embeddings or not.
+
+If you do not use any pre-trained word embeddings, your word vectors will be customised for your domain. For example,
+in general English, the word "balance" is closely related to "symmetry", but very different to the word "cash". In a
+banking domain, "balance" and "cash" are closely related and you'd like your model to capture that. If you don't
+use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language and domain.
+Thus, you should only use featurizers from the category `sparse` featuirzers, such as
+``CountVectorsFeaturizer`` or ``RegexFeaturizer``.
+
+The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
+"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model already knows that the
+words "apples" and "pears" are very similar. This is especially useful if you don't have large enough training data.
+We support a few components that provide pre-trained word embeddings:
+
+1. ``MitieFeaturizer``
+2. ``SpacyFeaturizer``
+3. ``ConveRTFeaturizer``
+4. ``LanguageModelFeaturizer``
+
+The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
+creates a contextual vector representation for the complete sentence. For example, if you
+have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
+my place", since the contextual vector representation for both examples are already very similar, the intent classified
+for both is highly likely to be the same. This is also useful if you don't have large enough training data.
+
+TODO when to use what featurizer
+
+Entity Recognition / Intent Classification / Response Selectors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Depending on your data you may want to only perform intent classification or just entity recognition.
+We support several components for each of the task. All of them are listed in :ref:`components`.
+We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`response-selector`
+for response selection.
 
-However, we do not recommend that you use it as mitie support is likely to be deprecated in a future release.
 
 Comparing different pipelines for your data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -106,7 +159,7 @@ Class imbalance
 
 Classification algorithms often do not perform well if there is a large `class imbalance`,
 for example if you have a lot of training data for some intents and very little training data for others.
-To mitigate this problem, rasa's ``supervised_embeddings`` pipeline uses a ``balanced`` batching strategy.
+To mitigate this problem, you can use a ``balanced`` batching strategy.
 This algorithm ensures that all classes are represented in every batch, or at least in
 as many subsequent batches as possible, still mimicking the fact that some classes are more frequent than others.
 Balanced batching is used by default. In order to turn it off and use a classic batching strategy include
@@ -117,22 +170,24 @@ Balanced batching is used by default. In order to turn it off and use a classic
     language: "en"
 
     pipeline:
-    - name: "CountVectorsFeaturizer"
-    - name: "EmbeddingIntentClassifier"
+    - ... # other components
+    - name: "DIETClassifier"
       batch_strategy: sequence
 
 
 Multiple Intents
 ----------------
 
-If you want to split intents into multiple labels,
-e.g. for predicting multiple intents or for modeling hierarchical intent structure,
-you can only do this with the supervised embeddings pipeline.
-To do this, use these flags in ``Whitespace Tokenizer``:
+If you want to split intents into multiple labels, e.g. for predicting multiple intents or for modeling hierarchical
+intent structure, you need to use :ref:`diet-classifier` in your pipeline.
+To do this, use these flags in any tokenizer:
 
-    - ``intent_split_symbol``: sets the delimiter string to split the intent labels. Default ``_``
+    - ``intent_tokenization_flag``: indicates whether to tokenize intent labels or not. By default this flag is set to
+      ``False``, intent will not be tokenized.
+    - ``intent_split_symbol``: sets the delimiter string to split the intent labels. Default ``_``.
 
-`Here <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__ is a tutorial on how to use multiple intents in Rasa Core and NLU.
+`Here <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__ is a
+tutorial on how to use multiple intents in Rasa.
 
 Here's an example configuration:
 
@@ -142,23 +197,24 @@ Here's an example configuration:
 
     pipeline:
     - name: "WhitespaceTokenizer"
+      intent_tokenization_flag: True
       intent_split_symbol: "_"
     - name: "CountVectorsFeaturizer"
-    - name: "EmbeddingIntentClassifier"
+    - name: "DIETClassifier"
 
 
 Understanding the Rasa NLU Pipeline
 -----------------------------------
 
 In Rasa NLU, incoming messages are processed by a sequence of components.
-These components are executed one after another
-in a so-called processing pipeline. There are components for entity extraction, for intent classification, response selection,
+These components are executed one after another in a so-called processing pipeline.
+There are components for entity extraction, for intent classification, response selection,
 pre-processing, and others. If you want to add your own component, for example to run a spell-check or to
 do sentiment analysis, check out :ref:`custom-nlu-components`.
 
 Each component processes the input and creates an output. The output can be used by any component that comes after
 this component in the pipeline. There are components which only produce information that is used by other components
-in the pipeline and there are other components that produce ``Output`` attributes which will be returned after
+in the pipeline and there are other components that produce ``output`` attributes which will be returned after
 the processing has finished. For example, for the sentence ``"I am looking for Chinese food"`` the output is:
 
 .. code-block:: json
@@ -166,7 +222,14 @@ the processing has finished. For example, for the sentence ``"I am looking for C
     {
         "text": "I am looking for Chinese food",
         "entities": [
-            {"start": 8, "end": 15, "value": "chinese", "entity": "cuisine", "extractor": "CRFEntityExtractor", "confidence": 0.864}
+            {
+                "start": 8,
+                "end": 15,
+                "value": "chinese",
+                "entity": "cuisine",
+                "extractor": "DIETClassifier",
+                "confidence": 0.864
+            }
         ],
         "intent": {"confidence": 0.6485910906220309, "name": "restaurant_search"},
         "intent_ranking": [
@@ -175,18 +238,35 @@ the processing has finished. For example, for the sentence ``"I am looking for C
         ]
     }
 
-This is created as a combination of the results of the different components in the pre-configured pipeline ``pretrained_embeddings_spacy``.
-For example, the ``entities`` attribute is created by the ``CRFEntityExtractor`` component.
+This is created as a combination of the results of the different components in the following pipeline:
+
+.. code-block:: yaml
+
+    language: "en"
+
+    pipeline:
+      - name: WhitespaceTokenizer
+      - name: RegexFeaturizer
+      - name: LexicalSyntacticFeaturizer
+      - name: CountVectorsFeaturizer
+      - name: CountVectorsFeaturizer
+        analyzer: "char_wb"
+        min_ngram: 1
+        max_ngram: 4
+      - name: DIETClassifier
+      - name: EntitySynonymMapper
+      - name: DIETSelector
+
+For example, the ``entities`` attribute is created by the ``DIETClassifier`` component.
 
 
 .. _section_component_lifecycle:
 
 Component Lifecycle
 -------------------
-Every component can implement several methods from the ``Component``
-base class; in a pipeline these different methods
-will be called in a specific order. Lets assume, we added the following
-pipeline to our config:
+
+Every component can implement several methods from the ``Component`` base class; in a pipeline these different methods
+will be called in a specific order. Lets assume, we added the following pipeline to our config:
 ``"pipeline": ["Component A", "Component B", "Last Component"]``.
 The image shows the call order during the training of this pipeline:
 
@@ -236,20 +316,23 @@ exactly. Instead it will return the trained synonym.
 
 .. note::
 
-    The ``confidence`` will be set by the CRF entity extractor
-    (``CRFEntityExtractor`` component). The duckling entity extractor will always return
-    ``1``. The ``SpacyEntityExtractor`` extractor does not provide this information and
-    returns ``null``.
+    The ``confidence`` will be set by the ``CRFEntityExtractor`` and ``DIETClassifier`` component. The
+    ``DucklingHTTPExtractor`` will always return ``1``. The ``SpacyEntityExtractor`` extractor does not provide this
+    information and returns ``null``.
 
 
-Pre-configured Pipelines
-------------------------
+Pipeline Templates (deprecated)
+-------------------------------
 
-A template is just a shortcut for
-a full list of components. For example, these two configurations are equivalent:
+A template is just a shortcut for a full list of components. For example, these two configurations are equivalent:
 
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_spacy.yml
-    :language: yaml
+.. code-block:: yaml
+
+    language: "en"
+
+    pipeline: "pretrained_embeddings_spacy"
+
+and
 
 .. code-block:: yaml
 
@@ -264,58 +347,81 @@ a full list of components. For example, these two configurations are equivalent:
     - name: "EntitySynonymMapper"
     - name: "SklearnIntentClassifier"
 
-Below is a list of all the pre-configured pipeline templates with customization information.
+The three most important pipelines are ``supervised_embeddings``, ``pretrained_embeddings_convert`` and
+``pretrained_embeddings_spacy``.
+The ``pretrained_embeddings_spacy`` pipeline uses pre-trained word vectors from either GloVe or fastText,
+whereas ``pretrained_embeddings_convert`` uses a pretrained sentence encoding model
+`ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ to extract vector representations of complete user
+utterance as a whole. On the other hand, the ``supervised_embeddings`` pipeline doesn't use any pre-trained word
+vectors or sentence vectors, but instead fits these specifically for your dataset.
 
-.. _section_supervised_embeddings_pipeline:
+.. note::
+    These recommendations are highly dependent on your dataset and hence approximate. We suggest experimenting with
+    different pipelines to train the best model.
 
-supervised_embeddings
-~~~~~~~~~~~~~~~~~~~~~
+.. _section_pretrained_embeddings_spacy_pipeline:
 
-To train a Rasa model in your preferred language, define the
-``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file:
+pretrained_embeddings_spacy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. literalinclude:: ../../data/test_config/config_supervised_embeddings.yml
-    :language: yaml
+The advantage of ``pretrained_embeddings_spacy`` pipeline is that if you have a training example like:
+"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model
+already knows that the words "apples" and "pears" are very similar. This is especially useful
+if you don't have large enough training data.
 
-The ``supervised_embeddings`` pipeline supports any language that can be tokenized.  By default it uses whitespace
-for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are the default
-components that make up the ``supervised_embeddings`` pipeline:
+To use the ``pretrained_embeddings_spacy`` template, use the following configuration:
+
+.. code-block:: yaml
+
+    language: "en"
+
+    pipeline: "pretrained_embeddings_spacy"
+
+See :ref:`pretrained-word-vectors` for more information about loading spacy language models.
+To use the components and configure them separately:
 
 .. code-block:: yaml
 
     language: "en"
 
     pipeline:
-    - name: "WhitespaceTokenizer"
+    - name: "SpacyNLP"
+    - name: "SpacyTokenizer"
+    - name: "SpacyFeaturizer"
     - name: "RegexFeaturizer"
     - name: "CRFEntityExtractor"
     - name: "EntitySynonymMapper"
-    - name: "CountVectorsFeaturizer"
-    - name: "CountVectorsFeaturizer"
-      analyzer: "char_wb"
-      min_ngram: 1
-      max_ngram: 4
-    - name: "EmbeddingIntentClassifier"
-    
-So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you
-can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers <tokenizers>`,
-or you can :ref:`create your own <custom-nlu-components>`.
-
-The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one 
-featurizes text based on words. The second one featurizes text based on character 
-n-grams, preserving word boundaries. We empirically found the second featurizer 
-to be more powerful, but we decided to keep the first featurizer as well to make
-featurization more robust.
+    - name: "SklearnIntentClassifier"
 
 .. _section_pretrained_embeddings_convert_pipeline:
 
 pretrained_embeddings_convert
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+    .. warning::
+        Since ``ConveRT`` model is trained only on an **English** corpus of conversations, this pipeline should only
+        be used if your training data is in English language.
+
+This pipeline uses `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model to extract vector representation of
+a sentence and feeds them to ``EmbeddingIntentClassifier`` for intent classification.
+The advantage of using ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user
+message independently, but creates a contextual vector representation for the complete sentence. For example, if you
+have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
+my place", since the contextual vector representation for both examples are already very similar, the intent classified
+for both is highly likely to be the same. This is also useful if you don't have large enough training data.
+
+    .. note::
+        To use ``pretrained_embeddings_convert`` pipeline, you should install Rasa with ``pip install rasa[convert]``.
+        Please also note that one of the dependencies(``tensorflow-text``) is currently only supported on Linux
+        platforms.
+
 To use the ``pretrained_embeddings_convert`` template:
 
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_convert.yml
-    :language: yaml
+.. code-block:: yaml
+
+    language: "en"
+
+    pipeline: "pretrained_embeddings_convert"
 
 To use the components and configure them separately:
 
@@ -328,37 +434,69 @@ To use the components and configure them separately:
     - name: "ConveRTFeaturizer"
     - name: "EmbeddingIntentClassifier"
 
-.. _section_pretrained_embeddings_spacy_pipeline:
+.. _section_supervised_embeddings_pipeline:
 
-pretrained_embeddings_spacy
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+supervised_embeddings
+~~~~~~~~~~~~~~~~~~~~~
 
-To use the ``pretrained_embeddings_spacy`` template:
+The advantage of the ``supervised_embeddings`` pipeline is that your word vectors will be customised
+for your domain. For example, in general English, the word "balance" is closely related to "symmetry",
+but very different to the word "cash". In a banking domain, "balance" and "cash" are closely related
+and you'd like your model to capture that. This pipeline doesn't use a language-specific model,
+so it will work with any language that you can tokenize (on whitespace or using a custom tokenizer).
 
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_spacy.yml
-    :language: yaml
+You can read more about this topic `here <https://medium.com/rasa-blog/supervised-word-vectors-from-scratch-in-rasa-nlu-6daf794efcd8>`__ .
 
-See :ref:`pretrained-word-vectors` for more information about loading spacy language models.
-To use the components and configure them separately:
+To train a Rasa model in your preferred language, define the
+``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file:
+
+.. code-block:: yaml
+
+    language: "en"
+
+    pipeline: "supervised_embeddings"
+
+The ``supervised_embeddings`` pipeline supports any language that can be tokenized.  By default it uses whitespace
+for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are the default
+components that make up the ``supervised_embeddings`` pipeline:
 
 .. code-block:: yaml
 
     language: "en"
 
     pipeline:
-    - name: "SpacyNLP"
-    - name: "SpacyTokenizer"
-    - name: "SpacyFeaturizer"
+    - name: "WhitespaceTokenizer"
     - name: "RegexFeaturizer"
     - name: "CRFEntityExtractor"
     - name: "EntitySynonymMapper"
-    - name: "SklearnIntentClassifier"
+    - name: "CountVectorsFeaturizer"
+    - name: "CountVectorsFeaturizer"
+      analyzer: "char_wb"
+      min_ngram: 1
+      max_ngram: 4
+    - name: "EmbeddingIntentClassifier"
+    
+So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you
+can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers <tokenizers>`,
+or you can :ref:`create your own <custom-nlu-components>`.
+
+The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one 
+featurizes text based on words. The second one featurizes text based on character 
+n-grams, preserving word boundaries. We empirically found the second featurizer 
+to be more powerful, but we decided to keep the first featurizer as well to make
+featurization more robust.
 
 .. _section_mitie_pipeline:
 
 MITIE
 ~~~~~
 
+You can also use MITIE as a source of word vectors in your pipeline.
+The MITIE backend performs well for small datasets, but training can take very long if you have more than a couple
+of hundred examples.
+
+However, we do not recommend that you use it as mitie support is likely to be deprecated in a future release.
+
 To use the MITIE pipeline, you will have to train word vectors from a corpus. Instructions can be found
 :ref:`here <mitie>`. This will give you the file path to pass to the ``model`` parameter.
 
@@ -369,24 +507,4 @@ Another version of this pipeline uses MITIE's featurizer and also its multi-clas
 Training can be quite slow, so this is not recommended for large datasets.
 
 .. literalinclude:: ../../data/test_config/config_pretrained_embeddings_mitie_2.yml
-    :language: yaml
-
-
-Custom pipelines
-----------------
-
-You don't have to use a template, you can also run a fully custom pipeline
-by listing the names of the components you want to use:
-
-.. code-block:: yaml
-
-    pipeline:
-    - name: "SpacyNLP"
-    - name: "CRFEntityExtractor"
-    - name: "EntitySynonymMapper"
-
-This creates a pipeline that only does entity recognition, but no
-intent classification. So Rasa NLU will not predict any intents.
-You can find the details of each component in :ref:`components`.
-
-If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
+    :language: yaml
\ No newline at end of file

From d930dcb3781158325c9cec515d1ee8ff4d9f8aa4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 15:20:47 +0100
Subject: [PATCH 451/633] switch to old crf

---
 rasa/nlu/extractors/crf_entity_extractor.py | 740 ++++++++++++++++----
 1 file changed, 596 insertions(+), 144 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 45ac7abf05fb..dd104723a6eb 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -1,61 +1,69 @@
 import logging
 import os
-import warnings
-from typing import Any, Dict, Optional, Text
+import typing
+import numpy as np
+from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
 
-from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import (
-    LexicalSyntacticFeaturizer,
-)
-from rasa.constants import DOCS_URL_COMPONENTS
+from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig
+from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
-from rasa.nlu.training_data import TrainingData, Message
-from rasa.constants import DOCS_BASE_URL
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
-from rasa.nlu.constants import TEXT, ENTITIES, TOKENS_NAMES
-from rasa.utils.tensorflow.constants import (
-    HIDDEN_LAYERS_SIZES,
-    NUM_TRANSFORMER_LAYERS,
-    BATCH_SIZES,
-    EPOCHS,
-    RANDOM_SEED,
-    LEARNING_RATE,
-    DENSE_DIM,
-    SPARSE_INPUT_DROPOUT,
-    MASKED_LM,
-    ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
-    EVAL_NUM_EXAMPLES,
-    EVAL_NUM_EPOCHS,
-    DROPRATE,
-    REGULARIZATION_CONSTANT,
-    BILOU_FLAG,
-    BATCH_STRATEGY,
+from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.training_data import Message, TrainingData
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TEXT,
+    DENSE_FEATURE_NAMES,
+    SPACY_DOCS,
+    ENTITIES,
+)
+from rasa.constants import (
+    DOCS_BASE_URL,
+    DOCS_URL_TRAINING_DATA_NLU,
+    DOCS_URL_COMPONENTS,
 )
 from rasa.utils.common import raise_warning
-from rasa.utils.tensorflow.models import RasaModel
+
+try:
+    import spacy
+except ImportError:
+    spacy = None
 
 logger = logging.getLogger(__name__)
 
+if typing.TYPE_CHECKING:
+    from sklearn_crfsuite import CRF
+    from spacy.tokens import Doc
+
 
-class CRFEntityExtractor(DIETClassifier):
+class CRFToken(NamedTuple):
+    text: Text
+    tag: Text
+    entity: Text
+    pattern: Dict[Text, Any]
+    dense_features: np.ndarray
+
+
+class CRFEntityExtractor(EntityExtractor):
 
     provides = [ENTITIES]
 
     requires = [TOKENS_NAMES[TEXT]]
 
-    # please make sure to update the docs when changing a default parameter
     defaults = {
-        # 'features' is [before, word, after] array with before, word,
-        # after holding keys about which features to use for each word,
-        # for example, 'title' in array before will have the feature
+        # BILOU_flag determines whether to use BILOU tagging or not.
+        # More rigorous however requires more examples per entity
+        # rule of thumb: use only if more than 100 egs. per entity
+        "BILOU_flag": True,
+        # crf_features is [before, word, after] array with before, word,
+        # after holding keys about which
+        # features to use for each word, for example, 'title' in
+        # array before will have the feature
         # "is the preceding word in title case?"
-        # POS features require 'SpacyTokenizer'.
+        # POS features require spaCy to be installed
         "features": [
             ["low", "title", "upper"],
             [
-                "BOS",
-                "EOS",
+                "bias",
                 "low",
                 "prefix5",
                 "prefix2",
@@ -65,108 +73,341 @@ class CRFEntityExtractor(DIETClassifier):
                 "upper",
                 "title",
                 "digit",
+                "pattern",
             ],
             ["low", "title", "upper"],
         ],
-        # nn architecture
-        # sizes of hidden layers before the embedding layer for input words
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES: {TEXT: []},
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
-        BATCH_SIZES: [64, 256],
-        # number of epochs
-        EPOCHS: 300,
-        # set random seed to any int to get reproducible results
-        RANDOM_SEED: None,
-        # optimizer
-        LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512},
-        # regularization parameters
-        # the scale of regularization
-        REGULARIZATION_CONSTANT: 0.002,
-        # dropout rate for rnn
-        DROPRATE: 0.2,
-        # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: True,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
-        # BILOU_flag determines whether to use BILOU tagging or not.
-        # More rigorous however requires more examples per entity
-        # rule of thumb: use only if more than 100 egs. per entity
-        BILOU_FLAG: True,
+        # The maximum number of iterations for optimization algorithms.
+        "max_iterations": 50,
+        # weight of theL1 regularization
+        "L1_c": 0.1,
+        # weight of the L2 regularization
+        "L2_c": 0.1,
+    }
+
+    function_dict = {
+        "low": lambda crf_token: crf_token.text.lower(),  # pytype: disable=attribute-error
+        "title": lambda crf_token: crf_token.text.istitle(),  # pytype: disable=attribute-error
+        "prefix5": lambda crf_token: crf_token.text[:5],
+        "prefix2": lambda crf_token: crf_token.text[:2],
+        "suffix5": lambda crf_token: crf_token.text[-5:],
+        "suffix3": lambda crf_token: crf_token.text[-3:],
+        "suffix2": lambda crf_token: crf_token.text[-2:],
+        "suffix1": lambda crf_token: crf_token.text[-1:],
+        "pos": lambda crf_token: crf_token.tag,
+        "pos2": lambda crf_token: crf_token.tag[:2],
+        "bias": lambda crf_token: "bias",
+        "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
+        "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
+        "pattern": lambda crf_token: crf_token.pattern,
+        "text_dense_features": lambda crf_token: crf_token.dense_features,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
-        featurizer: Optional[LexicalSyntacticFeaturizer] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
+        ent_tagger: Optional["CRF"] = None,
     ) -> None:
-        component_config = component_config or {}
-
-        # the following properties cannot be adapted for the CRFEntityExtractor
-        component_config[INTENT_CLASSIFICATION] = False
-        component_config[ENTITY_RECOGNITION] = True
-        component_config[MASKED_LM] = False
-        component_config[NUM_TRANSFORMER_LAYERS] = 0
-        component_config[BATCH_STRATEGY] = "sequence"
-
-        super().__init__(
-            component_config,
-            inverted_label_dict,
-            inverted_tag_dict,
-            model,
-            batch_tuple_sizes,
-        )
 
-        self.featurizer = featurizer or LexicalSyntacticFeaturizer(
-            self.component_config
-        )
+        super().__init__(component_config)
 
-        raise_warning(
-            f"'CRFEntityExtractor' is deprecated and will be remove in version 2.0. "
-            f"Use 'DIETClassifier' in combination with 'LexicalSyntacticFeaturizer' "
-            f"instead.",
-            category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
-        )
+        self.ent_tagger = ent_tagger
+
+        self._validate_configuration()
+
+        self._check_pos_features_and_spacy()
+
+    def _check_pos_features_and_spacy(self) -> None:
+        import itertools
+
+        features = self.component_config.get("features", [])
+        fts = set(itertools.chain.from_iterable(features))
+        self.pos_features = "pos" in fts or "pos2" in fts
+        if self.pos_features:
+            self._check_spacy()
+
+    @staticmethod
+    def _check_spacy() -> None:
+        if spacy is None:
+            raise ImportError(
+                "Failed to import `spaCy`. "
+                "`spaCy` is required for POS features "
+                "See https://spacy.io/usage/ for installation"
+                "instructions."
+            )
+
+    def _validate_configuration(self) -> None:
+        if len(self.component_config.get("features", [])) % 2 != 1:
+            raise ValueError(
+                "Need an odd number of crf feature lists to have a center word."
+            )
+
+    @classmethod
+    def required_packages(cls) -> List[Text]:
+        return ["sklearn_crfsuite", "sklearn"]
 
     def train(
-        self,
-        training_data: TrainingData,
-        config: Optional[RasaNLUModelConfig] = None,
-        **kwargs: Any,
+        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
     ) -> None:
 
-        if not training_data.entity_examples:
-            return
+        # checks whether there is at least one
+        # example with an entity annotation
+        if training_data.entity_examples:
+            self._check_spacy_doc(training_data.training_examples[0])
+
+            # filter out pre-trained entity examples
+            filtered_entity_examples = self.filter_trainable_entities(
+                training_data.training_examples
+            )
+
+            # convert the dataset into features
+            # this will train on ALL examples, even the ones
+            # without annotations
+            dataset = self._create_dataset(filtered_entity_examples)
+
+            self._train_model(dataset)
 
-        self.featurizer.train(training_data, **kwargs)
+    def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
+        dataset = []
 
-        super().train(training_data, config, **kwargs)
+        for example in examples:
+            entity_offsets = self._convert_example(example)
+            dataset.append(self._from_json_to_crf(example, entity_offsets))
+
+        return dataset
+
+    def _check_spacy_doc(self, message: Message) -> None:
+        if self.pos_features and message.get(SPACY_DOCS[TEXT]) is None:
+            raise InvalidConfigError(
+                "Could not find `spacy_doc` attribute for "
+                "message {}\n"
+                "POS features require a pipeline component "
+                "that provides `spacy_doc` attributes, i.e. `SpacyNLP`. "
+                "See {}/nlu/choosing-a-pipeline/#pretrained-embeddings-spacy "
+                "for details".format(message.text, DOCS_BASE_URL)
+            )
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
-        self.featurizer.process(message, **kwargs)
+        self._check_spacy_doc(message)
+
+        extracted = self.add_extractor_name(self.extract_entities(message))
+        message.set(
+            ENTITIES,
+            message.get(ENTITIES, []) + extracted,
+            add_to_output=True,
+        )
+
+    @staticmethod
+    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
+        def convert_entity(entity):
+            return entity["start"], entity["end"], entity["entity"]
+
+        return [convert_entity(ent) for ent in example.get(ENTITIES, [])]
+
+    def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
+        """Take a sentence and return entities in json format"""
+
+        if self.ent_tagger is not None:
+            text_data = self._from_text_to_crf(message)
+            features = self._sentence_to_features(text_data)
+            ents = self.ent_tagger.predict_marginals_single(features)
+            return self._from_crf_to_json(message, ents)
+        else:
+            return []
+
+    def most_likely_entity(self, idx: int, entities: List[Any]) -> Tuple[Text, Any]:
+        if len(entities) > idx:
+            entity_probs = entities[idx]
+        else:
+            entity_probs = None
+        if entity_probs:
+            label = max(entity_probs, key=lambda key: entity_probs[key])
+            if self.component_config["BILOU_flag"]:
+                # if we are using bilou flags, we will combine the prob
+                # of the B, I, L and U tags for an entity (so if we have a
+                # score of 60% for `B-address` and 40% and 30%
+                # for `I-address`, we will return 70%)
+                return (
+                    label,
+                    sum([v for k, v in entity_probs.items() if k[2:] == label[2:]]),
+                )
+            else:
+                return label, entity_probs[label]
+        else:
+            return "", 0.0
+
+    def _create_entity_dict(
+        self,
+        message: Message,
+        tokens: Union["Doc", List[Token]],
+        start: int,
+        end: int,
+        entity: str,
+        confidence: float,
+    ) -> Dict[Text, Any]:
+        if isinstance(tokens, list):  # tokens is a list of Token
+            _start = tokens[start].start
+            _end = tokens[end].end
+            value = tokens[start].text
+            value += "".join(
+                [
+                    message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text
+                    for i in range(start + 1, end + 1)
+                ]
+            )
+        else:  # tokens is a Doc
+            _start = tokens[start].idx
+            _end = tokens[start : end + 1].end_char
+            value = tokens[start : end + 1].text
+
+        return {
+            "start": _start,
+            "end": _end,
+            "value": value,
+            "entity": entity,
+            "confidence": confidence,
+        }
+
+    @staticmethod
+    def _entity_from_label(label) -> Text:
+        return label[2:]
+
+    @staticmethod
+    def _bilou_from_label(label) -> Optional[Text]:
+        if len(label) >= 2 and label[1] == "-":
+            return label[0].upper()
+        return None
+
+    @staticmethod
+    def _tokens_without_cls(message: Message) -> List[Token]:
+        # [:-1] to remove the CLS token from the list of tokens
+        return message.get(TOKENS_NAMES[TEXT])[:-1]
+
+    def _find_bilou_end(self, word_idx, entities) -> Any:
+        ent_word_idx = word_idx + 1
+        finished = False
+
+        # get information about the first word, tagged with `B-...`
+        label, confidence = self.most_likely_entity(word_idx, entities)
+        entity_label = self._entity_from_label(label)
 
-        super().process(message, **kwargs)
+        while not finished:
+            label, label_confidence = self.most_likely_entity(ent_word_idx, entities)
 
-    def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
+            confidence = min(confidence, label_confidence)
 
-        self.featurizer.persist(file_name + ".featurizer", model_dir)
+            if label[2:] != entity_label:
+                # words are not tagged the same entity class
+                logger.debug(
+                    "Inconsistent BILOU tagging found, B- tag, L- "
+                    "tag pair encloses multiple entity classes.i.e. "
+                    "[B-a, I-b, L-a] instead of [B-a, I-a, L-a].\n"
+                    "Assuming B- class is correct."
+                )
 
-        return super().persist(file_name, model_dir)
+            if label.startswith("L-"):
+                # end of the entity
+                finished = True
+            elif label.startswith("I-"):
+                # middle part of the entity
+                ent_word_idx += 1
+            else:
+                # entity not closed by an L- tag
+                finished = True
+                ent_word_idx -= 1
+                logger.debug(
+                    "Inconsistent BILOU tagging found, B- tag not "
+                    "closed by L- tag, i.e [B-a, I-a, O] instead of "
+                    "[B-a, L-a, O].\nAssuming last tag is L-"
+                )
+        return ent_word_idx, confidence
+
+    def _handle_bilou_label(
+        self, word_idx: int, entities: List[Any]
+    ) -> Tuple[Any, Any, Any]:
+        label, confidence = self.most_likely_entity(word_idx, entities)
+        entity_label = self._entity_from_label(label)
+
+        if self._bilou_from_label(label) == "U":
+            return word_idx, confidence, entity_label
+
+        elif self._bilou_from_label(label) == "B":
+            # start of multi word-entity need to represent whole extent
+            ent_word_idx, confidence = self._find_bilou_end(word_idx, entities)
+            return ent_word_idx, confidence, entity_label
+
+        else:
+            return None, None, None
+
+    def _from_crf_to_json(
+        self, message: Message, entities: List[Any]
+    ) -> List[Dict[Text, Any]]:
+
+        if self.pos_features:
+            tokens = message.get(SPACY_DOCS[TEXT])
+        else:
+            tokens = self._tokens_without_cls(message)
+
+        if len(tokens) != len(entities):
+            raise Exception(
+                "Inconsistency in amount of tokens between crfsuite and message"
+            )
+
+        if self.component_config["BILOU_flag"]:
+            return self._convert_bilou_tagging_to_entity_result(
+                message, tokens, entities
+            )
+        else:
+            # not using BILOU tagging scheme, multi-word entities are split.
+            return self._convert_simple_tagging_to_entity_result(tokens, entities)
+
+    def _convert_bilou_tagging_to_entity_result(
+        self, message: Message, tokens: List[Token], entities: List[Dict[Text, float]]
+    ):
+        # using the BILOU tagging scheme
+        json_ents = []
+        word_idx = 0
+        while word_idx < len(tokens):
+            end_idx, confidence, entity_label = self._handle_bilou_label(
+                word_idx, entities
+            )
+
+            if end_idx is not None:
+                ent = self._create_entity_dict(
+                    message, tokens, word_idx, end_idx, entity_label, confidence
+                )
+                json_ents.append(ent)
+                word_idx = end_idx + 1
+            else:
+                word_idx += 1
+        return json_ents
+
+    def _convert_simple_tagging_to_entity_result(
+        self, tokens: List[Union[Token, Any]], entities: List[Any]
+    ) -> List[Dict[Text, Any]]:
+        json_ents = []
+
+        for word_idx in range(len(tokens)):
+            entity_label, confidence = self.most_likely_entity(word_idx, entities)
+            word = tokens[word_idx]
+            if entity_label != "O":
+                if self.pos_features and not isinstance(word, Token):
+                    start = word.idx
+                    end = word.idx + len(word)
+                else:
+                    start = word.start
+                    end = word.end
+                ent = {
+                    "start": start,
+                    "end": end,
+                    "value": word.text,
+                    "entity": entity_label,
+                    "confidence": confidence,
+                }
+                json_ents.append(ent)
+
+        return json_ents
 
     @classmethod
     def load(
@@ -177,36 +418,247 @@ def load(
         cached_component: Optional["CRFEntityExtractor"] = None,
         **kwargs: Any,
     ) -> "CRFEntityExtractor":
+        from sklearn.externals import joblib
+
+        file_name = meta.get("file")
+        model_file = os.path.join(model_dir, file_name)
+
+        if os.path.exists(model_file):
+            ent_tagger = joblib.load(model_file)
+            return cls(meta, ent_tagger)
+        else:
+            return cls(meta)
+
+    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
+        """Persist this model into the passed directory.
+
+        Returns the metadata necessary to load the model again."""
+
+        from sklearn.externals import joblib
+
+        file_name = file_name + ".pkl"
+        if self.ent_tagger:
+            model_file_name = os.path.join(model_dir, file_name)
+            joblib.dump(self.ent_tagger, model_file_name)
+
+        return {"file": file_name}
+
+    def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any]]:
+        """Convert a word into discrete features in self.crf_features,
+        including word before and word after."""
+
+        configured_features = self.component_config["features"]
+        sentence_features = []
+
+        for word_idx in range(len(sentence)):
+            # word before(-1), current word(0), next word(+1)
+            feature_span = len(configured_features)
+            half_span = feature_span // 2
+            feature_range = range(-half_span, half_span + 1)
+            prefixes = [str(i) for i in feature_range]
+            word_features = {}
+            for f_i in feature_range:
+                if word_idx + f_i >= len(sentence):
+                    word_features["EOS"] = True
+                    # End Of Sentence
+                elif word_idx + f_i < 0:
+                    word_features["BOS"] = True
+                    # Beginning Of Sentence
+                else:
+                    word = sentence[word_idx + f_i]
+                    f_i_from_zero = f_i + half_span
+                    prefix = prefixes[f_i_from_zero]
+                    features = configured_features[f_i_from_zero]
+                    for feature in features:
+                        if feature == "pattern":
+                            # add all regexes as a feature
+                            regex_patterns = self.function_dict[feature](word)
+                            # pytype: disable=attribute-error
+                            for p_name, matched in regex_patterns.items():
+                                feature_name = prefix + ":" + feature + ":" + p_name
+                                word_features[feature_name] = matched
+                            # pytype: enable=attribute-error
+                        else:
+                            # append each feature to a feature vector
+                            value = self.function_dict[feature](word)
+                            word_features[prefix + ":" + feature] = value
+            sentence_features.append(word_features)
+        return sentence_features
 
-        if not model_dir or not meta.get("file"):
-            warnings.warn(
-                f"Failed to load 'CRFEntityExtractor'. "
-                f"Maybe the path '{os.path.abspath(model_dir)}' doesn't exist?"
+    @staticmethod
+    def _sentence_to_labels(
+        sentence: List[
+            Tuple[
+                Optional[Text],
+                Optional[Text],
+                Text,
+                Dict[Text, Any],
+                Optional[Dict[str, Any]],
+            ]
+        ],
+    ) -> List[Text]:
+
+        return [label for _, _, label, _, _ in sentence]
+
+    def _from_json_to_crf(
+        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
+    ) -> List[CRFToken]:
+        """Convert json examples to format of underlying crfsuite."""
+
+        if self.pos_features:
+            from spacy.gold import GoldParse  # pytype: disable=import-error
+
+            doc_or_tokens = message.get(SPACY_DOCS[TEXT])
+            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
+            ents = [l[5] for l in gold.orig_annot]
+        else:
+            doc_or_tokens = self._tokens_without_cls(message)
+            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)
+
+        # collect badly annotated examples
+        collected = []
+        for t, e in zip(doc_or_tokens, ents):
+            if e == "-":
+                collected.append(t)
+            elif collected:
+                collected_text = " ".join([t.text for t in collected])
+                raise_warning(
+                    f"Misaligned entity annotation for '{collected_text}' "
+                    f"in sentence '{message.text}' with intent "
+                    f"'{message.get('intent')}'. "
+                    f"Make sure the start and end values of the "
+                    f"annotated training examples end at token "
+                    f"boundaries (e.g. don't include trailing "
+                    f"whitespaces or punctuation).",
+                    docs=DOCS_URL_TRAINING_DATA_NLU,
+                )
+                collected = []
+
+        if not self.component_config["BILOU_flag"]:
+            for i, label in enumerate(ents):
+                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
+                    # removes BILOU prefix from label
+                    ents[i] = self._entity_from_label(label)
+
+        return self._from_text_to_crf(message, ents)
+
+    @staticmethod
+    def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text]:
+        # From spacy.spacy.GoldParse, under MIT License
+        starts = {token.start: i for i, token in enumerate(tokens)}
+        ends = {token.end: i for i, token in enumerate(tokens)}
+        bilou = ["-" for _ in tokens]
+        # Handle entity cases
+        for start_char, end_char, label in entities:
+            start_token = starts.get(start_char)
+            end_token = ends.get(end_char)
+            # Only interested if the tokenization is correct
+            if start_token is not None and end_token is not None:
+                if start_token == end_token:
+                    bilou[start_token] = "U-%s" % label
+                else:
+                    bilou[start_token] = "B-%s" % label
+                    for i in range(start_token + 1, end_token):
+                        bilou[i] = "I-%s" % label
+                    bilou[end_token] = "L-%s" % label
+        # Now distinguish the O cases from ones where we miss the tokenization
+        entity_chars = set()
+        for start_char, end_char, label in entities:
+            for i in range(start_char, end_char):
+                entity_chars.add(i)
+        for n, token in enumerate(tokens):
+            for i in range(token.start, token.end):
+                if i in entity_chars:
+                    break
+            else:
+                bilou[n] = missing
+
+        return bilou
+
+    @staticmethod
+    def __pattern_of_token(message: Message, i: int) -> Dict:
+        if message.get(TOKENS_NAMES[TEXT]) is not None:
+            return message.get(TOKENS_NAMES[TEXT])[i].get("pattern", {})
+        else:
+            return {}
+
+    @staticmethod
+    def __tag_of_token(token: Any) -> Text:
+        if spacy.about.__version__ > "2" and token._.has("tag"):
+            return token._.get("tag")
+        else:
+            return token.tag_
+
+    @staticmethod
+    def __get_dense_features(message: Message) -> Optional[List[Any]]:
+        features = message.get(DENSE_FEATURE_NAMES[TEXT])
+
+        if features is None:
+            return None
+
+        tokens = message.get(TOKENS_NAMES[TEXT], [])
+        if len(tokens) != len(features):
+            raise_warning(
+                f"Number of features ({len(features)}) for attribute "
+                f"'{DENSE_FEATURE_NAMES[TEXT]}' "
+                f"does not match number of tokens ({len(tokens)}).",
+                docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
             )
-            return cls(component_config=meta)
+            return None
 
-        featurizer_meta = meta.copy()
-        featurizer_meta["file"] += ".featurizer"
-        featurizer = LexicalSyntacticFeaturizer.load(
-            featurizer_meta, model_dir, model_metadata, cached_component, **kwargs
-        )
+        # convert to python-crfsuite feature format
+        features_out = []
+        for feature in features:
+            feature_dict = {
+                str(index): token_features
+                for index, token_features in enumerate(feature)
+            }
+            converted = {"text_dense_features": feature_dict}
+            features_out.append(converted)
+        return features_out
+
+    def _from_text_to_crf(
+        self, message: Message, entities: List[Text] = None
+    ) -> List[CRFToken]:
+        """Takes a sentence and switches it to crfsuite format."""
+
+        crf_format = []
+        if self.pos_features:
+            tokens = message.get(SPACY_DOCS[TEXT])
+        else:
+            tokens = self._tokens_without_cls(message)
+
+        text_dense_features = self.__get_dense_features(message)
+
+        for i, token in enumerate(tokens):
+            pattern = self.__pattern_of_token(message, i)
+            entity = entities[i] if entities else "N/A"
+            tag = self.__tag_of_token(token) if self.pos_features else None
+            dense_features = (
+                text_dense_features[i] if text_dense_features is not None else []
+            )
+
+            crf_format.append(
+                CRFToken(token.text, tag, entity, pattern, dense_features)
+            )
+
+        return crf_format
+
+    def _train_model(self, df_train: List[List[CRFToken]]) -> None:
+        """Train the crf tagger based on the training data."""
+        import sklearn_crfsuite
 
-        (
-            batch_tuple_sizes,
-            inv_label_dict,
-            inv_tag_dict,
-            label_data,
-            meta,
-            data_example,
-        ) = cls._load_from_files(meta, model_dir)
-
-        model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
-
-        return cls(
-            component_config=meta,
-            featurizer=featurizer,
-            inverted_label_dict=inv_label_dict,
-            inverted_tag_dict=inv_tag_dict,
-            model=model,
-            batch_tuple_sizes=batch_tuple_sizes,
+        X_train = [self._sentence_to_features(sent) for sent in df_train]
+        y_train = [self._sentence_to_labels(sent) for sent in df_train]
+        self.ent_tagger = sklearn_crfsuite.CRF(
+            algorithm="lbfgs",
+            # coefficient for L1 penalty
+            c1=self.component_config["L1_c"],
+            # coefficient for L2 penalty
+            c2=self.component_config["L2_c"],
+            # stop earlier
+            max_iterations=self.component_config["max_iterations"],
+            # include transitions that are possible, but not observed
+            all_possible_transitions=True,
         )
+        self.ent_tagger.fit(X_train, y_train)

From 68e1999b804f52c019e2fa9a3b6bb055f8e44b2e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 15:21:49 +0100
Subject: [PATCH 452/633] switch to l2 everywhere

---
 rasa/utils/tensorflow/layers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index a2bf2e192e2d..fb5bc7442682 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -37,7 +37,7 @@ class DenseForSparse(tf.keras.layers.Dense):
 
     def __init__(self, reg_lambda: float = 0, **kwargs) -> None:
         if reg_lambda > 0:
-            regularizer = tf.keras.regularizers.l1(reg_lambda)
+            regularizer = tf.keras.regularizers.l2(reg_lambda)
         else:
             regularizer = None
 
@@ -217,7 +217,7 @@ class CRF(tf.keras.layers.Layer):
     def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
         super().__init__(name=name)
         self.num_tags = num_tags
-        self.regularizer = tf.keras.regularizers.l1(reg_lambda)
+        self.regularizer = tf.keras.regularizers.l2(reg_lambda)
 
     def build(self, input_shape: tf.TensorShape) -> None:
         # should be created in `build` to apply random_seed

From a2a0f84ef5020e809b93a22b36a61d24a80fac2a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 15:24:34 +0100
Subject: [PATCH 453/633] black

---
 rasa/nlu/extractors/crf_entity_extractor.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index dd104723a6eb..d0319e5cebe6 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -147,7 +147,10 @@ def required_packages(cls) -> List[Text]:
         return ["sklearn_crfsuite", "sklearn"]
 
     def train(
-        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
     ) -> None:
 
         # checks whether there is at least one
@@ -193,9 +196,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         extracted = self.add_extractor_name(self.extract_entities(message))
         message.set(
-            ENTITIES,
-            message.get(ENTITIES, []) + extracted,
-            add_to_output=True,
+            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
         )
 
     @staticmethod

From ba1778cb9283411ab30e45aa08567b821a539ef4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 15:49:10 +0100
Subject: [PATCH 454/633] add tests

---
 rasa/nlu/components.py   | 12 +++++------
 tests/nlu/test_config.py | 46 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 1fa68425daf7..e80d9936914c 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -2,7 +2,7 @@
 import typing
 from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
 
-from rasa.nlu.config import RasaNLUModelConfig, override_defaults
+from rasa.nlu.config import RasaNLUModelConfig, override_defaults, InvalidConfigError
 from rasa.nlu.constants import RESPONSE
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
@@ -62,7 +62,7 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
             tokenizer_names.append(component.name)
 
     if len(tokenizer_names) > 1:
-        raise Exception(
+        raise InvalidConfigError(
             f"More then one tokenizer is used: {tokenizer_names}. "
             f"You can use only one tokenizer."
         )
@@ -75,7 +75,7 @@ def validate_required_components(pipeline: List["Component"]) -> None:
     for component in pipeline:
         unique_component_names.add(component.name)
         if not set(component.required_components).issubset(unique_component_names):
-            raise Exception(
+            raise InvalidConfigError(
                 f"'{component.name}' requires {component.required_components}. "
                 f"Add required components to the pipeline."
             )
@@ -90,7 +90,7 @@ def validate_arguments(
 
     # Ensure the pipeline is not empty
     if not allow_empty_pipeline and len(pipeline) == 0:
-        raise ValueError(
+        raise InvalidConfigError(
             "Can not train an empty pipeline. "
             "Make sure to specify a proper pipeline in "
             "the configuration using the 'pipeline' key. "
@@ -106,7 +106,7 @@ def validate_arguments(
                 validate_requires_any_of(r, provided_properties, str(component.name))
             else:
                 if r not in provided_properties:
-                    raise Exception(
+                    raise InvalidConfigError(
                         f"Failed to validate component {component.name}. "
                         f"Missing property: '{r}'"
                     )
@@ -135,7 +135,7 @@ def validate_requires_any_of(
     )
 
     if not property_present:
-        raise Exception(
+        raise InvalidConfigError(
             f"Failed to validate component '{component_name}'. "
             f"Missing one of the following properties: "
             f"{required_properties}."
diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py
index 9fabad40b700..38b0c9a05859 100644
--- a/tests/nlu/test_config.py
+++ b/tests/nlu/test_config.py
@@ -7,6 +7,8 @@
 from rasa.nlu import config
 from rasa.nlu.components import ComponentBuilder
 from rasa.nlu.registry import registered_pipeline_templates
+from rasa.nlu.model import Trainer
+from rasa.nlu.training_data.training_data import TrainingData
 from tests.nlu.utilities import write_file_config
 
 
@@ -34,6 +36,50 @@ def test_invalid_pipeline_template():
     assert "unknown pipeline template" in str(execinfo.value)
 
 
+def test_invalid_many_tokenizers_in_config():
+    nlu_config = {
+        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}],
+    }
+
+    with pytest.raises(config.InvalidConfigError) as execinfo:
+        Trainer(config.RasaNLUModelConfig(nlu_config))
+    assert "More then one tokenizer is used" in str(execinfo.value)
+
+
+def test_invalid_requred_components_in_config():
+    spacy_config = {
+        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}],
+    }
+    convert_config = {
+        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}],
+    }
+    lm_config = {
+        "pipeline": [
+            {"name": "ConveRTTokenizer"},
+            {"name": "LanguageModelFeaturizer"},
+        ],
+    }
+    count_vectors_config = {
+        "pipeline": [{"name": "CountVectorsFeaturizer"}],
+    }
+
+    with pytest.raises(config.InvalidConfigError) as execinfo:
+        Trainer(config.RasaNLUModelConfig(spacy_config))
+    assert "Add required components to the pipeline" in str(execinfo.value)
+
+    with pytest.raises(config.InvalidConfigError) as execinfo:
+        Trainer(config.RasaNLUModelConfig(convert_config))
+    assert "Add required components to the pipeline" in str(execinfo.value)
+
+    with pytest.raises(config.InvalidConfigError) as execinfo:
+        Trainer(config.RasaNLUModelConfig(lm_config))
+    assert "Add required components to the pipeline" in str(execinfo.value)
+
+    with pytest.raises(config.InvalidConfigError) as execinfo:
+        Trainer(config.RasaNLUModelConfig(count_vectors_config)).train(TrainingData())
+    assert "Missing property" in str(execinfo.value)
+
+
 @pytest.mark.parametrize(
     "pipeline_template", list(registered_pipeline_templates.keys())
 )

From 9cdea3ac4eccd294d8b5cfa76493f13d065f6e2d Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 16:12:30 +0100
Subject: [PATCH 455/633] fix test pipelines

---
 .../dense_featurizer/mitie_featurizer.py      |  2 +-
 tests/nlu/test_train.py                       | 87 ++++++++++++++-----
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index f0e8d916c7c9..cd6fbf8dcf5f 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -30,7 +30,7 @@ class MitieFeaturizer(Featurizer):
         TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ] + ["mitie_feature_extractor"]
 
-    required_components = [MitieNLP.name, MitieTokenizer.name]
+    required_components = [MitieNLP.name]
 
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 282a703755e2..bfd075da0ab6 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -22,59 +22,67 @@ def pipelines_for_tests():
     # tested they still need to be in a useful order - hence we can not simply
     # generate this automatically.
 
+    # Create separate test pipelines for dense featurizers
+    # because they can't co-exist in the same pipeline together,
+    # as their tokenizers break the incoming message into different number of tokens.
+
     # first is language followed by list of components
     return [
         (
             "en",
             as_pipeline(
-                "SpacyNLP",
-                "MitieNLP",
+                "KeywordIntentClassifier",
+            ),
+        ),
+        (
+            "en",
+            as_pipeline(
                 "WhitespaceTokenizer",
-                "MitieTokenizer",
-                "SpacyTokenizer",
-                "MitieFeaturizer",
-                "SpacyFeaturizer",
                 "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
-                "MitieEntityExtractor",
                 "CRFEntityExtractor",
-                "SpacyEntityExtractor",
                 "DucklingHTTPExtractor",
-                "EntitySynonymMapper",
-                "SklearnIntentClassifier",
-                "MitieIntentClassifier",
-                "EmbeddingIntentClassifier",
                 "DIETClassifier",
-                "KeywordIntentClassifier",
                 "ResponseSelector",
                 "DIETSelector",
+                "EntitySynonymMapper",
             ),
         ),
         (
-            "zh",
+            "en",
             as_pipeline(
-                "MitieNLP",
-                "JiebaTokenizer",
-                "MitieFeaturizer",
-                "MitieEntityExtractor",
+                "SpacyNLP",
+                "SpacyTokenizer",
+                "SpacyFeaturizer",
+                "RegexFeaturizer",
+                "LexicalSyntacticFeaturizer",
+                "CountVectorsFeaturizer",
+                "CRFEntityExtractor",
+                "DucklingHTTPExtractor",
+                "SpacyEntityExtractor",
                 "SklearnIntentClassifier",
-                "KeywordIntentClassifier",
+                "DIETClassifier",
+                "ResponseSelector",
+                "DIETSelector",
+                "EntitySynonymMapper",
             ),
         ),
-        # Create separate test pipelines for dense featurizers because they can't co-exist in the same pipeline
-        # together, as their tokenizers break the incoming message into different number of tokens.
         (
             "en",
             as_pipeline(
                 "HFTransformersNLP",
                 "LanguageModelTokenizer",
                 "LanguageModelFeaturizer",
+                "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "CRFEntityExtractor",
+                "DucklingHTTPExtractor",
                 "DIETClassifier",
                 "ResponseSelector",
+                "DIETSelector",
+                "EntitySynonymMapper",
             ),
         ),
         (
@@ -82,11 +90,48 @@ def pipelines_for_tests():
             as_pipeline(
                 "ConveRTTokenizer",
                 "ConveRTFeaturizer",
+                "RegexFeaturizer",
                 "LexicalSyntacticFeaturizer",
                 "CountVectorsFeaturizer",
                 "CRFEntityExtractor",
+                "DucklingHTTPExtractor",
+                "DIETClassifier",
+                "ResponseSelector",
+                "DIETSelector",
+                "EntitySynonymMapper",
+            ),
+        ),
+        (
+            "en",
+            as_pipeline(
+                "MitieNLP",
+                "MitieTokenizer",
+                "MitieFeaturizer",
+                "RegexFeaturizer",
+                "CountVectorsFeaturizer",
+                "MitieEntityExtractor",
+                "DucklingHTTPExtractor",
+                "MitieIntentClassifier",
+                "DIETClassifier",
+                "ResponseSelector",
+                "DIETSelector",
+                "EntitySynonymMapper",
+            ),
+        ),
+        (
+            "zh",
+            as_pipeline(
+                "MitieNLP",
+                "JiebaTokenizer",
+                "MitieFeaturizer",
+                "RegexFeaturizer",
+                "CountVectorsFeaturizer",
+                "MitieEntityExtractor",
+                "MitieIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
+                "DIETSelector",
+                "EntitySynonymMapper",
             ),
         ),
     ]

From 22775e4f12e6103a872dbbf304de380fd5bd3c3e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 16:15:24 +0100
Subject: [PATCH 456/633] black

---
 tests/nlu/test_train.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index bfd075da0ab6..bdc6f0d0d85a 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -28,12 +28,7 @@ def pipelines_for_tests():
 
     # first is language followed by list of components
     return [
-        (
-            "en",
-            as_pipeline(
-                "KeywordIntentClassifier",
-            ),
-        ),
+        ("en", as_pipeline("KeywordIntentClassifier")),
         (
             "en",
             as_pipeline(

From 99c8cdde57486f20c6d3f1ab5d1ab5a789756e54 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 16:57:30 +0100
Subject: [PATCH 457/633] reuse existing methods

---
 rasa/nlu/extractors/crf_entity_extractor.py   | 184 +++-----------
 rasa/nlu/utils/bilou_utils.py                 |  22 +-
 .../extractors/test_crf_entity_extractor.py   | 226 +++++++++++++++++-
 tests/nlu/utils/test_bilou_utils.py           |  17 ++
 4 files changed, 282 insertions(+), 167 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index d0319e5cebe6..5693537a6e31 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -4,30 +4,16 @@
 import numpy as np
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
 
-from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig
+import rasa.nlu.utils.bilou_utils as bilou_utils
+from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import (
-    TOKENS_NAMES,
-    TEXT,
-    DENSE_FEATURE_NAMES,
-    SPACY_DOCS,
-    ENTITIES,
-)
-from rasa.constants import (
-    DOCS_BASE_URL,
-    DOCS_URL_TRAINING_DATA_NLU,
-    DOCS_URL_COMPONENTS,
-)
+from rasa.nlu.constants import TOKENS_NAMES, TEXT, DENSE_FEATURE_NAMES, ENTITIES
+from rasa.constants import DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_COMPONENTS
 from rasa.utils.common import raise_warning
 
-try:
-    import spacy
-except ImportError:
-    spacy = None
-
 logger = logging.getLogger(__name__)
 
 if typing.TYPE_CHECKING:
@@ -95,7 +81,9 @@ class CRFEntityExtractor(EntityExtractor):
         "suffix2": lambda crf_token: crf_token.text[-2:],
         "suffix1": lambda crf_token: crf_token.text[-1:],
         "pos": lambda crf_token: crf_token.tag,
-        "pos2": lambda crf_token: crf_token.tag[:2],
+        "pos2": lambda crf_token: crf_token.tag[:2]
+        if crf_token.tag is not None
+        else None,
         "bias": lambda crf_token: "bias",
         "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
         "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
@@ -115,27 +103,6 @@ def __init__(
 
         self._validate_configuration()
 
-        self._check_pos_features_and_spacy()
-
-    def _check_pos_features_and_spacy(self) -> None:
-        import itertools
-
-        features = self.component_config.get("features", [])
-        fts = set(itertools.chain.from_iterable(features))
-        self.pos_features = "pos" in fts or "pos2" in fts
-        if self.pos_features:
-            self._check_spacy()
-
-    @staticmethod
-    def _check_spacy() -> None:
-        if spacy is None:
-            raise ImportError(
-                "Failed to import `spaCy`. "
-                "`spaCy` is required for POS features "
-                "See https://spacy.io/usage/ for installation"
-                "instructions."
-            )
-
     def _validate_configuration(self) -> None:
         if len(self.component_config.get("features", [])) % 2 != 1:
             raise ValueError(
@@ -156,8 +123,6 @@ def train(
         # checks whether there is at least one
         # example with an entity annotation
         if training_data.entity_examples:
-            self._check_spacy_doc(training_data.training_examples[0])
-
             # filter out pre-trained entity examples
             filtered_entity_examples = self.filter_trainable_entities(
                 training_data.training_examples
@@ -174,37 +139,14 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
         dataset = []
 
         for example in examples:
-            entity_offsets = self._convert_example(example)
+            entity_offsets = bilou_utils.map_message_entities(example)
             dataset.append(self._from_json_to_crf(example, entity_offsets))
 
         return dataset
 
-    def _check_spacy_doc(self, message: Message) -> None:
-        if self.pos_features and message.get(SPACY_DOCS[TEXT]) is None:
-            raise InvalidConfigError(
-                "Could not find `spacy_doc` attribute for "
-                "message {}\n"
-                "POS features require a pipeline component "
-                "that provides `spacy_doc` attributes, i.e. `SpacyNLP`. "
-                "See {}/nlu/choosing-a-pipeline/#pretrained-embeddings-spacy "
-                "for details".format(message.text, DOCS_BASE_URL)
-            )
-
     def process(self, message: Message, **kwargs: Any) -> None:
-
-        self._check_spacy_doc(message)
-
         extracted = self.add_extractor_name(self.extract_entities(message))
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
-
-    @staticmethod
-    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
-        def convert_entity(entity):
-            return entity["start"], entity["end"], entity["entity"]
-
-        return [convert_entity(ent) for ent in example.get(ENTITIES, [])]
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
 
     def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
         """Take a sentence and return entities in json format"""
@@ -270,16 +212,6 @@ def _create_entity_dict(
             "confidence": confidence,
         }
 
-    @staticmethod
-    def _entity_from_label(label) -> Text:
-        return label[2:]
-
-    @staticmethod
-    def _bilou_from_label(label) -> Optional[Text]:
-        if len(label) >= 2 and label[1] == "-":
-            return label[0].upper()
-        return None
-
     @staticmethod
     def _tokens_without_cls(message: Message) -> List[Token]:
         # [:-1] to remove the CLS token from the list of tokens
@@ -291,7 +223,7 @@ def _find_bilou_end(self, word_idx, entities) -> Any:
 
         # get information about the first word, tagged with `B-...`
         label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = self._entity_from_label(label)
+        entity_label = bilou_utils.entity_name_from_tag(label)
 
         while not finished:
             label, label_confidence = self.most_likely_entity(ent_word_idx, entities)
@@ -328,12 +260,12 @@ def _handle_bilou_label(
         self, word_idx: int, entities: List[Any]
     ) -> Tuple[Any, Any, Any]:
         label, confidence = self.most_likely_entity(word_idx, entities)
-        entity_label = self._entity_from_label(label)
+        entity_label = bilou_utils.entity_name_from_tag(label)
 
-        if self._bilou_from_label(label) == "U":
+        if bilou_utils.bilou_prefix_from_tag(label) == "U":
             return word_idx, confidence, entity_label
 
-        elif self._bilou_from_label(label) == "B":
+        elif bilou_utils.bilou_prefix_from_tag(label) == "B":
             # start of multi word-entity need to represent whole extent
             ent_word_idx, confidence = self._find_bilou_end(word_idx, entities)
             return ent_word_idx, confidence, entity_label
@@ -345,10 +277,7 @@ def _from_crf_to_json(
         self, message: Message, entities: List[Any]
     ) -> List[Dict[Text, Any]]:
 
-        if self.pos_features:
-            tokens = message.get(SPACY_DOCS[TEXT])
-        else:
-            tokens = self._tokens_without_cls(message)
+        tokens = self._tokens_without_cls(message)
 
         if len(tokens) != len(entities):
             raise Exception(
@@ -393,15 +322,9 @@ def _convert_simple_tagging_to_entity_result(
             entity_label, confidence = self.most_likely_entity(word_idx, entities)
             word = tokens[word_idx]
             if entity_label != "O":
-                if self.pos_features and not isinstance(word, Token):
-                    start = word.idx
-                    end = word.idx + len(word)
-                else:
-                    start = word.start
-                    end = word.end
                 ent = {
-                    "start": start,
-                    "end": end,
+                    "start": word.start,
+                    "end": word.end,
                     "value": word.text,
                     "entity": entity_label,
                     "confidence": confidence,
@@ -458,6 +381,7 @@ def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any
             feature_range = range(-half_span, half_span + 1)
             prefixes = [str(i) for i in feature_range]
             word_features = {}
+
             for f_i in feature_range:
                 if word_idx + f_i >= len(sentence):
                     word_features["EOS"] = True
@@ -479,10 +403,16 @@ def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any
                                 feature_name = prefix + ":" + feature + ":" + p_name
                                 word_features[feature_name] = matched
                             # pytype: enable=attribute-error
+                        elif feature == "pos" or feature == "pos2":
+                            if word is None:
+                                continue
+                            value = self.function_dict[feature](word)
+                            word_features[prefix + ":" + feature] = value
                         else:
                             # append each feature to a feature vector
                             value = self.function_dict[feature](word)
                             word_features[prefix + ":" + feature] = value
+
             sentence_features.append(word_features)
         return sentence_features
 
@@ -506,19 +436,12 @@ def _from_json_to_crf(
     ) -> List[CRFToken]:
         """Convert json examples to format of underlying crfsuite."""
 
-        if self.pos_features:
-            from spacy.gold import GoldParse  # pytype: disable=import-error
-
-            doc_or_tokens = message.get(SPACY_DOCS[TEXT])
-            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
-            ents = [l[5] for l in gold.orig_annot]
-        else:
-            doc_or_tokens = self._tokens_without_cls(message)
-            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)
+        tokens = self._tokens_without_cls(message)
+        ents = bilou_utils.bilou_tags_from_offsets(tokens, entity_offsets)
 
         # collect badly annotated examples
         collected = []
-        for t, e in zip(doc_or_tokens, ents):
+        for t, e in zip(tokens, ents):
             if e == "-":
                 collected.append(t)
             elif collected:
@@ -537,45 +460,12 @@ def _from_json_to_crf(
 
         if not self.component_config["BILOU_flag"]:
             for i, label in enumerate(ents):
-                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
+                if bilou_utils.bilou_prefix_from_tag(label) in {"B", "I", "U", "L"}:
                     # removes BILOU prefix from label
-                    ents[i] = self._entity_from_label(label)
+                    ents[i] = bilou_utils.entity_name_from_tag(label)
 
         return self._from_text_to_crf(message, ents)
 
-    @staticmethod
-    def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text]:
-        # From spacy.spacy.GoldParse, under MIT License
-        starts = {token.start: i for i, token in enumerate(tokens)}
-        ends = {token.end: i for i, token in enumerate(tokens)}
-        bilou = ["-" for _ in tokens]
-        # Handle entity cases
-        for start_char, end_char, label in entities:
-            start_token = starts.get(start_char)
-            end_token = ends.get(end_char)
-            # Only interested if the tokenization is correct
-            if start_token is not None and end_token is not None:
-                if start_token == end_token:
-                    bilou[start_token] = "U-%s" % label
-                else:
-                    bilou[start_token] = "B-%s" % label
-                    for i in range(start_token + 1, end_token):
-                        bilou[i] = "I-%s" % label
-                    bilou[end_token] = "L-%s" % label
-        # Now distinguish the O cases from ones where we miss the tokenization
-        entity_chars = set()
-        for start_char, end_char, label in entities:
-            for i in range(start_char, end_char):
-                entity_chars.add(i)
-        for n, token in enumerate(tokens):
-            for i in range(token.start, token.end):
-                if i in entity_chars:
-                    break
-            else:
-                bilou[n] = missing
-
-        return bilou
-
     @staticmethod
     def __pattern_of_token(message: Message, i: int) -> Dict:
         if message.get(TOKENS_NAMES[TEXT]) is not None:
@@ -583,13 +473,6 @@ def __pattern_of_token(message: Message, i: int) -> Dict:
         else:
             return {}
 
-    @staticmethod
-    def __tag_of_token(token: Any) -> Text:
-        if spacy.about.__version__ > "2" and token._.has("tag"):
-            return token._.get("tag")
-        else:
-            return token.tag_
-
     @staticmethod
     def __get_dense_features(message: Message) -> Optional[List[Any]]:
         features = message.get(DENSE_FEATURE_NAMES[TEXT])
@@ -602,7 +485,9 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]:
             raise_warning(
                 f"Number of features ({len(features)}) for attribute "
                 f"'{DENSE_FEATURE_NAMES[TEXT]}' "
-                f"does not match number of tokens ({len(tokens)}).",
+                f"does not match number of tokens ({len(tokens)}). Set "
+                f"'return_sequence' to true in the corresponding featurizer in order "
+                f"to make use of the features in 'CRFEntityExtractor'.",
                 docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
             )
             return None
@@ -624,17 +509,14 @@ def _from_text_to_crf(
         """Takes a sentence and switches it to crfsuite format."""
 
         crf_format = []
-        if self.pos_features:
-            tokens = message.get(SPACY_DOCS[TEXT])
-        else:
-            tokens = self._tokens_without_cls(message)
+        tokens = self._tokens_without_cls(message)
 
         text_dense_features = self.__get_dense_features(message)
 
         for i, token in enumerate(tokens):
             pattern = self.__pattern_of_token(message, i)
             entity = entities[i] if entities else "N/A"
-            tag = self.__tag_of_token(token) if self.pos_features else None
+            tag = token.get("pos")
             dense_features = (
                 text_dense_features[i] if text_dense_features is not None else []
             )
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 73efd6faaecb..1e8c42612170 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -3,16 +3,18 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message
 from rasa.nlu.training_data import TrainingData
-from rasa.nlu.constants import (
-    ENTITIES,
-    TOKENS_NAMES,
-    TEXT,
-    BILOU_ENTITIES,
-)
+from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT, BILOU_ENTITIES
 
 BILOU_PREFIXES = ["B-", "I-", "U-", "L-"]
 
 
+def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
+    """Remove the BILOU prefix from the given tag."""
+    if tag[:2] in BILOU_PREFIXES:
+        return tag[0]
+    return None
+
+
 def entity_name_from_tag(tag: Text) -> Text:
     """Remove the BILOU prefix from the given tag."""
     if tag[:2] in BILOU_PREFIXES:
@@ -68,13 +70,13 @@ def apply_bilou_schema(training_data: TrainingData):
         if not entities:
             continue
 
-        entities = _map_message_entities(message)
-        output = _bilou_tags_from_offsets(message.get(TOKENS_NAMES[TEXT]), entities)
+        entities = map_message_entities(message)
+        output = bilou_tags_from_offsets(message.get(TOKENS_NAMES[TEXT]), entities)
 
         message.set(BILOU_ENTITIES, output)
 
 
-def _map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
+def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
     """Maps the entities of the given message to their start, end, and tag values."""
 
     def convert_entity(entity):
@@ -83,7 +85,7 @@ def convert_entity(entity):
     return [convert_entity(entity) for entity in message.get(ENTITIES, [])]
 
 
-def _bilou_tags_from_offsets(
+def bilou_tags_from_offsets(
     tokens: List[Token], entities: List[Tuple[int, int, Text]], missing: Text = "O"
 ) -> List[Text]:
     """Creates a list of BILOU tags for the given list of tokens and entities."""
diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index f581ae36536d..6d90b7f97344 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -1,8 +1,9 @@
-from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES
+from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
+from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
-from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED
 
 
 def test_crf_extractor(spacy_nlp):
@@ -48,12 +49,10 @@ def test_crf_extractor(spacy_nlp):
                 ["low", "title", "upper", "pos", "pos2"],
                 ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                 ["low", "title", "upper", "pos", "pos2"],
-            ],
-            RANDOM_SEED: 1,
-            EPOCHS: 75,
+            ]
         }
     )
-    tokenizer = WhitespaceTokenizer()
+    tokenizer = SpacyTokenizer()
 
     training_data = TrainingData(training_examples=examples)
     tokenizer.train(training_data)
@@ -70,3 +69,218 @@ def test_crf_extractor(spacy_nlp):
     assert len(detected_entities) == 1
     assert detected_entities[0]["entity"] == "cuisine"
     assert detected_entities[0]["value"] == "italian"
+
+
+def test_crf_json_from_BILOU(spacy_nlp):
+    ext = CRFEntityExtractor(
+        component_config={
+            "features": [
+                ["low", "title", "upper", "pos", "pos2"],
+                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
+                ["low", "title", "upper", "pos", "pos2"],
+            ]
+        }
+    )
+
+    sentence = "I need a home cleaning close-by"
+
+    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})
+
+    tokenizer = SpacyTokenizer()
+    tokenizer.process(message)
+
+    r = ext._from_crf_to_json(
+        message,
+        [
+            {"O": 1.0},
+            {"O": 1.0},
+            {"O": 1.0},
+            {"B-what": 1.0},
+            {"L-what": 1.0},
+            {"B-where": 1.0},
+            {"I-where": 1.0},
+            {"L-where": 1.0},
+        ],
+    )
+    assert len(r) == 2, "There should be two entities"
+
+    assert r[0]["confidence"]  # confidence should exist
+    del r[0]["confidence"]
+    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}
+
+    assert r[1]["confidence"]  # confidence should exist
+    del r[1]["confidence"]
+    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
+
+
+def test_crf_json_from_non_BILOU(spacy_nlp):
+    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
+
+    ext = CRFEntityExtractor(
+        component_config={
+            "BILOU_flag": False,
+            "features": [
+                ["low", "title", "upper", "pos", "pos2"],
+                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
+                ["low", "title", "upper", "pos", "pos2"],
+            ],
+        }
+    )
+    sentence = "I need a home cleaning close-by"
+
+    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})
+
+    tokenizer = SpacyTokenizer()
+    tokenizer.process(message)
+
+    rs = ext._from_crf_to_json(
+        message,
+        [
+            {"O": 1.0},
+            {"O": 1.0},
+            {"O": 1.0},
+            {"what": 1.0},
+            {"what": 1.0},
+            {"where": 1.0},
+            {"where": 1.0},
+            {"where": 1.0},
+        ],
+    )
+
+    # non BILOU will split multi-word entities - hence 5
+    assert len(rs) == 5, "There should be five entities"
+
+    for r in rs:
+        assert r["confidence"]  # confidence should exist
+        del r["confidence"]
+
+    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
+    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
+    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
+    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
+    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
+
+
+def test_crf_create_entity_dict(spacy_nlp):
+    crf_extractor = CRFEntityExtractor()
+    spacy_tokenizer = SpacyTokenizer()
+    white_space_tokenizer = WhitespaceTokenizer()
+
+    examples = [
+        {
+            "message": Message(
+                "where is St. Michael's Hospital?",
+                {
+                    "intent": "search_location",
+                    "entities": [
+                        {
+                            "start": 9,
+                            "end": 31,
+                            "value": "St. Michael's Hospital",
+                            "entity": "hospital",
+                            "SpacyTokenizer": {
+                                "entity_start_token_idx": 2,
+                                "entity_end_token_idx": 5,
+                            },
+                            "WhitespaceTokenizer": {
+                                "entity_start_token_idx": 2,
+                                "entity_end_token_idx": 5,
+                            },
+                        }
+                    ],
+                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
+                },
+            )
+        },
+        {
+            "message": Message(
+                "where is Children's Hospital?",
+                {
+                    "intent": "search_location",
+                    "entities": [
+                        {
+                            "start": 9,
+                            "end": 28,
+                            "value": "Children's Hospital",
+                            "entity": "hospital",
+                            "SpacyTokenizer": {
+                                "entity_start_token_idx": 2,
+                                "entity_end_token_idx": 4,
+                            },
+                            "WhitespaceTokenizer": {
+                                "entity_start_token_idx": 2,
+                                "entity_end_token_idx": 4,
+                            },
+                        }
+                    ],
+                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
+                },
+            )
+        },
+    ]
+    for ex in examples:
+        # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text
+        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
+        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
+        for tokenizer, tokens in [
+            ("SpacyTokenizer", spacy_tokens),
+            ("WhitespaceTokenizer", white_space_tokens),
+        ]:
+            for entity in ex["message"].get("entities"):
+                parsed_entities = crf_extractor._create_entity_dict(
+                    ex["message"],
+                    tokens,
+                    entity[tokenizer]["entity_start_token_idx"],
+                    entity[tokenizer]["entity_end_token_idx"],
+                    entity["entity"],
+                    0.8,
+                )
+                assert parsed_entities == {
+                    "start": entity["start"],
+                    "end": entity["end"],
+                    "value": entity["value"],
+                    "entity": entity["entity"],
+                    "confidence": 0.8,
+                }
+
+
+def test_crf_use_dense_features(spacy_nlp):
+    crf_extractor = CRFEntityExtractor(
+        component_config={
+            "features": [
+                ["low", "title", "upper", "pos", "pos2"],
+                [
+                    "low",
+                    "suffix3",
+                    "suffix2",
+                    "upper",
+                    "title",
+                    "digit",
+                    "pos",
+                    "pos2",
+                    "text_dense_features",
+                ],
+                ["low", "title", "upper", "pos", "pos2"],
+            ]
+        }
+    )
+
+    spacy_featurizer = SpacyFeaturizer()
+    spacy_tokenizer = SpacyTokenizer()
+
+    text = "Rasa is a company in Berlin"
+    message = Message(text)
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
+
+    spacy_tokenizer.process(message)
+    spacy_featurizer.process(message)
+
+    text_data = crf_extractor._from_text_to_crf(message)
+    features = crf_extractor._sentence_to_features(text_data)
+
+    assert "0:text_dense_features" in features[0]
+    for i in range(0, len(message.data.get("text_dense_features")[0])):
+        assert (
+            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
+            == message.data.get("text_dense_features")[0][i]
+        )
diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py
index a27c39672c06..0efc08ecb5d8 100644
--- a/tests/nlu/utils/test_bilou_utils.py
+++ b/tests/nlu/utils/test_bilou_utils.py
@@ -22,6 +22,23 @@ def test_entity_name_from_tag(tag, expected):
     assert actual == expected
 
 
+@pytest.mark.parametrize(
+    "tag, expected",
+    [
+        ("B-person", "B"),
+        ("I-location", "I"),
+        ("location", None),
+        ("U-company", "U"),
+        ("L-company", "L"),
+        ("O-company", None),
+    ],
+)
+def test_bilou_from_tag(tag, expected):
+    actual = bilou_utils.bilou_prefix_from_tag(tag)
+
+    assert actual == expected
+
+
 def test_tags_to_ids():
     message = Message("Germany is part of the European Union")
     message.set(

From a175f3cdc2a998e236dfdb09abf1cc4488e17831 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 17:13:31 +0100
Subject: [PATCH 458/633] update docs

---
 docs/nlu/components.rst                     | 128 +++++++-------------
 rasa/nlu/extractors/crf_entity_extractor.py |   2 +-
 2 files changed, 46 insertions(+), 84 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index bc57f38e3143..a8bee2c7cc7a 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1330,73 +1330,62 @@ CRFEntityExtractor
     etc.) give probabilities to certain entity classes, as are transitions between
     neighbouring entity tags: the most likely set of tags is then calculated and returned.
 
+    .. note::
+        If POS features are used (pos or pos2), you need to have ``SpacyTokenizer`` in your pipeline.
+
     .. warning::
         ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See
         `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
 
 :Configuration:
+    You need to configure what kind of features the CRF should use.
+    The following features are available:
 
-    The following hyperparameters can be set:
-
-        - neural network's architecture:
-
-            - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before
-              the embedding layer for user inputs, the number of hidden layers
-              is equal to the length of the list.
-
-        - training:
-
-            - ``batch_size`` sets the number of training examples in one
-              forward/backward pass, the higher the batch size, the more
-              memory space you'll need.
-            - ``epochs`` sets the number of times the algorithm will see
-              training data, where one ``epoch`` equals one forward pass and
-              one backward pass of all the training examples.
-            - ``random_seed`` if set you will get reproducible
-              training results for the same inputs.
-            - ``learning_rate`` sets the initial learning rate of the optimizer.
-
-        - embedding:
-
-            - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse
-              tensors if no dense features are present.
-
-        - regularization:
-
-            - ``regularization_constant`` sets the scale of L2 regularization.
-            - ``droprate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
-            - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
-
-        - model configuration:
-
-            - ``features`` indicates what features to use. ``CRFEntityExtractor`` is using the same featurization
-              as ``LexicalSyntacticFeaturizer``. See :ref:`LexicalSyntacticFeaturizer` for details on what kind
-              of features are available.
-            - ``BILOU_flag`` determines whether to use BILOU tagging or not.
+    ===================  =============================================================================================
+    Feature Name         Description
+    ===================  =============================================================================================
+    low                  Checks if the token is lower case.
+    upper                Checks if the token is upper case.
+    title                Checks if the token starts with an uppercase character and all remaining characters are
+                         lowercased.
+    digit                Checks if the token contains just digits.
+    prefix5              Take the first five characters of the token.
+    prefix2              Take the first two characters of the token.
+    suffix5              Take the last five characters of the token.
+    suffix3              Take the last three characters of the token.
+    suffix2              Take the last two characters of the token.
+    suffix1              Take the last character of the token.
+    pos                  Take the Part-of-Speech tag of the token (spaCy required).
+    pos2                 Take the first two characters of the Part-of-Speech tag of the token (spaCy required).
+    pattern              Take the patterns defined by ``RegexFeaturizer``.
+    ===================  =============================================================================================
 
-    .. note:: There is an option to use linearly increasing batch size. The idea comes from
-              `<https://arxiv.org/abs/1711.00489>`_.
-              In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
-              If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
+    As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
+    previous tokens, the current token, and the next tokens in the sliding window.
+    You define the features as [before, token, after] array.
 
+    Additional you can set a flag to determine whether to use the BILOU tagging schema or not.
 
-    Default values:
+        - ``BILOU_flag`` determines whether to use BILOU tagging or not.
 
     .. code-block:: yaml
 
         pipeline:
         - name: "CRFEntityExtractor"
-            # 'features' is [before, word, after] array with before, word,
-            # after holding keys about which features to use for each word,
-            # for example, 'title' in array before will have the feature
+            # BILOU_flag determines whether to use BILOU tagging or not.
+            # More rigorous however requires more examples per entity
+            # rule of thumb: use only if more than 100 egs. per entity
+            "BILOU_flag": True
+            # crf_features is [before, word, after] array with before, word,
+            # after holding keys about which
+            # features to use for each word, for example, 'title' in
+            # array before will have the feature
             # "is the preceding word in title case?"
-            # POS features require 'SpacyTokenizer'.
+            # POS features require spaCy to be installed
             "features": [
                 ["low", "title", "upper"],
                 [
-                    "BOS",
-                    "EOS",
+                    "bias",
                     "low",
                     "prefix5",
                     "prefix2",
@@ -1406,43 +1395,16 @@ CRFEntityExtractor
                     "upper",
                     "title",
                     "digit",
+                    "pattern",
                 ],
                 ["low", "title", "upper"],
             ]
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [256, 128]}
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
-            "batch_size": [64, 256]
-            # number of epochs
-            "epochs": 300
-            # set random seed to any int to get reproducible results
-            "random_seed": None
-            # optimizer
-            "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512}
-            # regularization parameters
-            # the scale of regularization
-            "regularization_constant": 0.002
-            # dropout rate for rnn
-            "droprate": 0.2
-            # if true apply dropout to sparse tensors
-            "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # BILOU_flag determines whether to use BILOU tagging or not.
-            # More rigorous however requires more examples per entity
-            # rule of thumb: use only if more than 100 egs. per entity
-            "BILOU_flag": False
+            # The maximum number of iterations for optimization algorithms.
+            "max_iterations": 50
+            # weight of the L1 regularization
+            "L1_c": 0.1
+            # weight of the L2 regularization
+            "L2_c": 0.1
 
 .. _DucklingHTTPExtractor:
 
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 5693537a6e31..a950716dbbda 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -65,7 +65,7 @@ class CRFEntityExtractor(EntityExtractor):
         ],
         # The maximum number of iterations for optimization algorithms.
         "max_iterations": 50,
-        # weight of theL1 regularization
+        # weight of the L1 regularization
         "L1_c": 0.1,
         # weight of the L2 regularization
         "L2_c": 0.1,

From 2623866201e1020a0cac5b40e40a1d7e757c3150 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 17:15:30 +0100
Subject: [PATCH 459/633] black formatting

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index e1b19332b230..c1f4cfee8e5a 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -221,7 +221,7 @@ def train(
                     )
 
                 batch_start_index += batch_size
-        print('\n')
+        print("\n")
 
     def process(self, message: Message, **kwargs: Any) -> None:
 

From 9a34f59212fdf0cf0a83577b000acd27236522bc Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 19 Feb 2020 17:30:36 +0100
Subject: [PATCH 460/633] fix pbar in convertfeaturizer

---
 .../dense_featurizer/convert_featurizer.py           | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index e1b19332b230..35b57cfaca36 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -193,15 +193,11 @@ def train(
                 filter(lambda x: x.get(attribute), training_data.training_examples)
             )
 
-            batch_start_index = 0
             pbar = tqdm(
-                total=(len(non_empty_examples) // batch_size) + 1,
+                range(0, len(non_empty_examples), batch_size),
                 desc=attribute.capitalize() + " batches:",
             )
-
-            while batch_start_index < len(non_empty_examples):
-                pbar.update(1)
-
+            for batch_start_index in pbar:
                 batch_end_index = min(
                     batch_start_index + batch_size, len(non_empty_examples)
                 )
@@ -212,7 +208,6 @@ def train(
                 batch_features = self._compute_features(batch_examples, attribute)
 
                 for index, ex in enumerate(batch_examples):
-
                     ex.set(
                         DENSE_FEATURE_NAMES[attribute],
                         self._combine_with_existing_dense_features(
@@ -220,9 +215,6 @@ def train(
                         ),
                     )
 
-                batch_start_index += batch_size
-        print('\n')
-
     def process(self, message: Message, **kwargs: Any) -> None:
 
         features = self._compute_features([message])[0]

From 3952704e049f3a082bfe025bb63265aaaeaf5e30 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 18:28:52 +0100
Subject: [PATCH 461/633] raise deprecation warning

---
 rasa/nlu/extractors/crf_entity_extractor.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index a950716dbbda..147a62d0397b 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -103,6 +103,13 @@ def __init__(
 
         self._validate_configuration()
 
+        raise_warning(
+            "'CRFEntityExtractor' is deprecated and will be removed in version "
+            "2.0. Use 'DIETClassifier' instead.",
+            category=FutureWarning,
+            docs=DOCS_URL_COMPONENTS,
+        )
+
     def _validate_configuration(self) -> None:
         if len(self.component_config.get("features", [])) % 2 != 1:
             raise ValueError(

From 2fa3106396d78432b7da71e9da630b4a2b078df7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 19 Feb 2020 18:33:36 +0100
Subject: [PATCH 462/633] fix test

---
 tests/nlu/test_train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index bdc6f0d0d85a..9037d0c7971d 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -39,6 +39,7 @@ def pipelines_for_tests():
                 "CRFEntityExtractor",
                 "DucklingHTTPExtractor",
                 "DIETClassifier",
+                "EmbeddingIntentClassifier",
                 "ResponseSelector",
                 "DIETSelector",
                 "EntitySynonymMapper",

From 79451dbe4fdbc794d1c037a5da6ba12edc242255 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 09:12:34 +0100
Subject: [PATCH 463/633] add bias feature again

---
 docs/nlu/components.rst                       | 63 ++++++++++---------
 rasa/nlu/extractors/crf_entity_extractor.py   | 54 +++++++---------
 .../extractors/test_crf_entity_extractor.py   | 12 +++-
 3 files changed, 69 insertions(+), 60 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index a8bee2c7cc7a..5aa4d258f048 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1333,6 +1333,9 @@ CRFEntityExtractor
     .. note::
         If POS features are used (pos or pos2), you need to have ``SpacyTokenizer`` in your pipeline.
 
+    .. note::
+        If "pattern" features are used, you need to have ``RegexFeaturizer`` in your pipeline.
+
     .. warning::
         ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See
         `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
@@ -1341,24 +1344,26 @@ CRFEntityExtractor
     You need to configure what kind of features the CRF should use.
     The following features are available:
 
-    ===================  =============================================================================================
-    Feature Name         Description
-    ===================  =============================================================================================
-    low                  Checks if the token is lower case.
-    upper                Checks if the token is upper case.
-    title                Checks if the token starts with an uppercase character and all remaining characters are
-                         lowercased.
-    digit                Checks if the token contains just digits.
-    prefix5              Take the first five characters of the token.
-    prefix2              Take the first two characters of the token.
-    suffix5              Take the last five characters of the token.
-    suffix3              Take the last three characters of the token.
-    suffix2              Take the last two characters of the token.
-    suffix1              Take the last character of the token.
-    pos                  Take the Part-of-Speech tag of the token (spaCy required).
-    pos2                 Take the first two characters of the Part-of-Speech tag of the token (spaCy required).
-    pattern              Take the patterns defined by ``RegexFeaturizer``.
-    ===================  =============================================================================================
+    ===============  =============================================================================
+    Feature Name     Description
+    ===============  =============================================================================
+    low              Checks if the token is lower case.
+    upper            Checks if the token is upper case.
+    title            Checks if the token starts with an uppercase character and all remaining
+                     characters are lowercased.
+    digit            Checks if the token contains just digits.
+    prefix5          Take the first five characters of the token.
+    prefix2          Take the first two characters of the token.
+    suffix5          Take the last five characters of the token.
+    suffix3          Take the last three characters of the token.
+    suffix2          Take the last two characters of the token.
+    suffix1          Take the last character of the token.
+    pos              Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
+    pos2             Take the first two characters of the Part-of-Speech tag of the token
+                     (``SpacyTokenizer`` required).
+    pattern          Take the patterns defined by ``RegexFeaturizer``.
+    bias             Add an additional "bias" feature to the list of features.
+    ===============  =============================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
     previous tokens, the current token, and the next tokens in the sliding window.
@@ -1375,13 +1380,13 @@ CRFEntityExtractor
             # BILOU_flag determines whether to use BILOU tagging or not.
             # More rigorous however requires more examples per entity
             # rule of thumb: use only if more than 100 egs. per entity
-            "BILOU_flag": True
-            # crf_features is [before, word, after] array with before, word,
-            # after holding keys about which
-            # features to use for each word, for example, 'title' in
-            # array before will have the feature
-            # "is the preceding word in title case?"
-            # POS features require spaCy to be installed
+            "BILOU_flag": True,
+            # crf_features is [before, token, after] array with before, token,
+            # after holding keys about which features to use for each token,
+            # for example, 'title' in array before will have the feature
+            # "is the preceding token in title case?"
+            # POS features require SpacyTokenizer
+            # pattern feature require RegexFeaturizer
             "features": [
                 ["low", "title", "upper"],
                 [
@@ -1398,13 +1403,13 @@ CRFEntityExtractor
                     "pattern",
                 ],
                 ["low", "title", "upper"],
-            ]
+            ],
             # The maximum number of iterations for optimization algorithms.
-            "max_iterations": 50
+            "max_iterations": 50,
             # weight of the L1 regularization
-            "L1_c": 0.1
+            "L1_c": 0.1,
             # weight of the L2 regularization
-            "L2_c": 0.1
+            "L2_c": 0.1,
 
 .. _DucklingHTTPExtractor:
 
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 147a62d0397b..36ea46d21a02 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -18,7 +18,6 @@
 
 if typing.TYPE_CHECKING:
     from sklearn_crfsuite import CRF
-    from spacy.tokens import Doc
 
 
 class CRFToken(NamedTuple):
@@ -40,18 +39,17 @@ class CRFEntityExtractor(EntityExtractor):
         # More rigorous however requires more examples per entity
         # rule of thumb: use only if more than 100 egs. per entity
         "BILOU_flag": True,
-        # crf_features is [before, word, after] array with before, word,
-        # after holding keys about which
-        # features to use for each word, for example, 'title' in
-        # array before will have the feature
-        # "is the preceding word in title case?"
-        # POS features require spaCy to be installed
+        # crf_features is [before, token, after] array with before, token,
+        # after holding keys about which features to use for each token,
+        # for example, 'title' in array before will have the feature
+        # "is the preceding token in title case?"
+        # POS features require SpacyTokenizer
+        # pattern feature require RegexFeaturizer
         "features": [
             ["low", "title", "upper"],
             [
-                "bias",
                 "low",
-                "prefix5",
+                "bias" "prefix5",
                 "prefix2",
                 "suffix5",
                 "suffix3",
@@ -72,21 +70,21 @@ class CRFEntityExtractor(EntityExtractor):
     }
 
     function_dict = {
-        "low": lambda crf_token: crf_token.text.lower(),  # pytype: disable=attribute-error
-        "title": lambda crf_token: crf_token.text.istitle(),  # pytype: disable=attribute-error
+        "low": lambda crf_token: crf_token.text.lower(),
+        "title": lambda crf_token: crf_token.text.istitle(),
         "prefix5": lambda crf_token: crf_token.text[:5],
         "prefix2": lambda crf_token: crf_token.text[:2],
         "suffix5": lambda crf_token: crf_token.text[-5:],
         "suffix3": lambda crf_token: crf_token.text[-3:],
         "suffix2": lambda crf_token: crf_token.text[-2:],
         "suffix1": lambda crf_token: crf_token.text[-1:],
+        "bias": lambda crf_token: "bias",
         "pos": lambda crf_token: crf_token.tag,
         "pos2": lambda crf_token: crf_token.tag[:2]
         if crf_token.tag is not None
         else None,
-        "bias": lambda crf_token: "bias",
-        "upper": lambda crf_token: crf_token.text.isupper(),  # pytype: disable=attribute-error
-        "digit": lambda crf_token: crf_token.text.isdigit(),  # pytype: disable=attribute-error
+        "upper": lambda crf_token: crf_token.text.isupper(),
+        "digit": lambda crf_token: crf_token.text.isdigit(),
         "pattern": lambda crf_token: crf_token.pattern,
         "text_dense_features": lambda crf_token: crf_token.dense_features,
     }
@@ -187,29 +185,25 @@ def most_likely_entity(self, idx: int, entities: List[Any]) -> Tuple[Text, Any]:
         else:
             return "", 0.0
 
+    @staticmethod
     def _create_entity_dict(
-        self,
         message: Message,
-        tokens: Union["Doc", List[Token]],
+        tokens: List[Token],
         start: int,
         end: int,
         entity: str,
         confidence: float,
     ) -> Dict[Text, Any]:
-        if isinstance(tokens, list):  # tokens is a list of Token
-            _start = tokens[start].start
-            _end = tokens[end].end
-            value = tokens[start].text
-            value += "".join(
-                [
-                    message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text
-                    for i in range(start + 1, end + 1)
-                ]
-            )
-        else:  # tokens is a Doc
-            _start = tokens[start].idx
-            _end = tokens[start : end + 1].end_char
-            value = tokens[start : end + 1].text
+
+        _start = tokens[start].start
+        _end = tokens[end].end
+        value = tokens[start].text
+        value += "".join(
+            [
+                message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text
+                for i in range(start + 1, end + 1)
+            ]
+        )
 
         return {
             "start": _start,
diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py
index 6d90b7f97344..b2cebd0ef42a 100644
--- a/tests/nlu/extractors/test_crf_entity_extractor.py
+++ b/tests/nlu/extractors/test_crf_entity_extractor.py
@@ -76,7 +76,17 @@ def test_crf_json_from_BILOU(spacy_nlp):
         component_config={
             "features": [
                 ["low", "title", "upper", "pos", "pos2"],
-                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
+                [
+                    "low",
+                    "bias",
+                    "suffix3",
+                    "suffix2",
+                    "upper",
+                    "title",
+                    "digit",
+                    "pos",
+                    "pos2",
+                ],
                 ["low", "title", "upper", "pos", "pos2"],
             ]
         }

From d11a4eb4df8ab9325290581aa0d94b35a8a22ab0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 09:27:51 +0100
Subject: [PATCH 464/633] added changelog entries

---
 changelog/5266.feature.rst     | 13 +++++++++++++
 changelog/5266.improvement.rst |  1 +
 changelog/5266.misc.rst        |  9 +++++++++
 3 files changed, 23 insertions(+)
 create mode 100644 changelog/5266.feature.rst
 create mode 100644 changelog/5266.improvement.rst
 create mode 100644 changelog/5266.misc.rst

diff --git a/changelog/5266.feature.rst b/changelog/5266.feature.rst
new file mode 100644
index 000000000000..61b6d5576303
--- /dev/null
+++ b/changelog/5266.feature.rst
@@ -0,0 +1,13 @@
+Added a new NLU components ``DIETClassifier`` and a new policy ``TEDPolicy``.
+
+DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity
+recognition. You can read more about this component in our
+`documentation <https://rasa.com/docs/rasa/nlu/components/#diet-classifier>`_.
+The new component will replace the ``EmbeddingIntentClassifier`` and the ``CRFEntityExtractor`` in the future.
+Those two components are deprecated from now on.
+See `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for details on how to
+switch to the new component.
+
+``TEDPolicy`` is the new name for ``EmbeddingPolicy``. ``EmbeddingPolicy`` is deprecated from now on.
+The functionality of ``TEDPolicy`` and ``EmbeddingPolicy`` is the same. Please update your configuration file
+to use the new name for the policy.
diff --git a/changelog/5266.improvement.rst b/changelog/5266.improvement.rst
new file mode 100644
index 000000000000..1d0a19321893
--- /dev/null
+++ b/changelog/5266.improvement.rst
@@ -0,0 +1 @@
+We updated our code to Tensorflow 2.0.
diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst
new file mode 100644
index 000000000000..19da19d66e03
--- /dev/null
+++ b/changelog/5266.misc.rst
@@ -0,0 +1,9 @@
+We deprecated all existing pipeline templates, ``SklearnIntentClassifier`` and ``KerasPolicy``.
+
+Please list the components you want to use directly in your configuration file.
+Check out `Choosing a Pipeline <https://rasa.com/docs/rasa/nlu/choosing-a-pipeline/>`_ to decide what components to
+include in your pipeline.
+
+Use ``DIETClassifier`` instead of ``SklearnIntentClassifier``.
+
+Use ``TEDPolicy`` instead of ``KerasPolicy``.

From 828b7956f32966310854cfe93b43ff61a33c0584 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 09:35:57 +0100
Subject: [PATCH 465/633] review comments

---
 data/test/config_embedding_test.yml                    |  2 +-
 .../config_supervised_embeddings_duckling.yml          |  2 +-
 .../configs/config_pretrained_embeddings_spacy_de.yml  |  2 +-
 .../configs/config_pretrained_embeddings_spacy_en.yml  |  2 +-
 .../configs/config_supervised_embeddings_duckling.yml  |  1 -
 docs/core/old-core-change-log.rst                      |  2 +-
 docs/core/policies.rst                                 |  2 +-
 docs/migration-guide.rst                               | 10 +++++-----
 8 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/data/test/config_embedding_test.yml b/data/test/config_embedding_test.yml
index 40f570d6627f..1c9eb116fb09 100644
--- a/data/test/config_embedding_test.yml
+++ b/data/test/config_embedding_test.yml
@@ -3,4 +3,4 @@ pipeline:
 - name: "CountVectorsFeaturizer"
   max_ngram: 3
 - name: "DIETClassifier"
-  epochs: 10
\ No newline at end of file
+  epochs: 10
diff --git a/data/test_config/config_supervised_embeddings_duckling.yml b/data/test_config/config_supervised_embeddings_duckling.yml
index 8df9cb707711..c1771ea5addc 100644
--- a/data/test_config/config_supervised_embeddings_duckling.yml
+++ b/data/test_config/config_supervised_embeddings_duckling.yml
@@ -2,6 +2,6 @@ language: "en"
 
 pipeline:
 - name: "CountVectorsFeaturizer"
-- name: "DIETClassifier"
+- name: "EmbeddingIntentClassifier"
 - name: "DucklingHTTPExtractor"
   url: "http://duckling:8000"
diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml
index e4ab976a258d..5a5c1b64d985 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_de.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: DIETSelector
diff --git a/docker/configs/config_pretrained_embeddings_spacy_en.yml b/docker/configs/config_pretrained_embeddings_spacy_en.yml
index 244b5dc1a9a4..697a4d9eae09 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_en.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_en.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: DIETSelector
diff --git a/docker/configs/config_supervised_embeddings_duckling.yml b/docker/configs/config_supervised_embeddings_duckling.yml
index 3bb09d39765d..0637f14f706b 100644
--- a/docker/configs/config_supervised_embeddings_duckling.yml
+++ b/docker/configs/config_supervised_embeddings_duckling.yml
@@ -14,4 +14,3 @@ pipeline:
   - name: DIETSelector
   - name: DucklingHTTPExtractor
     url: "http://duckling:8000"
-  
\ No newline at end of file
diff --git a/docs/core/old-core-change-log.rst b/docs/core/old-core-change-log.rst
index 2ee3a48b5442..bda2dc5a065f 100644
--- a/docs/core/old-core-change-log.rst
+++ b/docs/core/old-core-change-log.rst
@@ -555,7 +555,7 @@ Added
 - intent confidence support in RegexInterpreter
 - added paramter to train script to pull training data from an url instead
   of a stories file
-- added new policy: :ref:`ted_policy` implemented in tensorflow
+- added new policy: :ref:`embedding_policy` implemented in tensorflow
 
 Changed
 -------
diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 83b636f60360..9393b862b923 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -208,7 +208,7 @@ Embedding Policy
 TED Policy
 ^^^^^^^^^^
 
-Transformer Embedding Dialogue Policy (TEDP)
+Transformer Embedding Dialogue (TED) Policy
 
 The policy used in our paper https://arxiv.org/abs/1910.00486.
 
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index ea02ded348a5..c66a07ef272f 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -27,7 +27,7 @@ General
   .. code-block:: yaml
 
     policies:
-    - ... # other policies
+    # - ... other policies
     - name: TEDPolicy
       max_history: 5
       epochs: 100
@@ -98,7 +98,7 @@ General
   .. code-block:: yaml
 
     pipeline:
-    - ... # other components
+    # - ... other components
     - name: LexicalSyntacticFeaturizer
       features: [
         ["low", "title", "upper"],
@@ -122,7 +122,7 @@ General
       entity_recognition: True
       use_masked_language_model: False
       number_of_transformer_layers: 0
-      ... # any other parameters
+      # ... any other parameters
 
   As you can see in the configuration, you need to add the ``LexicalSyntacticFeaturizer`` before the ``DIETClassifier``
   to your pipeline. ``CRFEntityExtractor`` featurizes user messages on its own, it does not depend on any featurizer.
@@ -138,14 +138,14 @@ General
   .. code-block:: yaml
 
     pipeline:
-    - ... # other components
+    # - ... other components
     - name: DIETSelector
       intent_classification: True
       entity_recognition: False
       use_masked_language_model: False
       BILOU_flag: False
       number_of_transformer_layers: 0
-      ... # any other parameters
+      # ... any other parameters
 
   See :ref:`diet-selector` for more information about the new component.
 

From b41ea6b611b58ceb45b5326da32aaef6401cbfb7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 10:04:10 +0100
Subject: [PATCH 466/633] add missing comma

---
 rasa/nlu/extractors/crf_entity_extractor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 36ea46d21a02..f3da1e3836ac 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -49,7 +49,8 @@ class CRFEntityExtractor(EntityExtractor):
             ["low", "title", "upper"],
             [
                 "low",
-                "bias" "prefix5",
+                "bias",
+                "prefix5",
                 "prefix2",
                 "suffix5",
                 "suffix3",

From 2dd3841fe8a0e647c787695991b1bb2d57e3a7b9 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 10:25:33 +0100
Subject: [PATCH 467/633] refactored model loading for convert

---
 .../dense_featurizer/convert_featurizer.py       | 10 ++++++++--
 rasa/nlu/tokenizers/convert_tokenizer.py         | 16 +++++++++++-----
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 7b81cba6c094..733af0a88137 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -32,6 +32,12 @@ class ConveRTFeaturizer(Featurizer):
 
     required_components = [ConveRTTokenizer.name]
 
+    def _load_from_tfhub(self, model_url: Text):
+
+        import tensorflow_hub as tfhub
+
+        self.module = tfhub.load(model_url)
+
     def _load_model(self) -> None:
 
         # needed in order to load model
@@ -43,10 +49,10 @@ def _load_model(self) -> None:
         # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
-            self.module = tfhub.load(model_url)
+            self._load_from_tfhub(model_url)
         except OSError:
             os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
-            self.module = tfhub.load(model_url)
+            self._load_from_tfhub(model_url)
 
         self.sentence_encoding_signature = self.module.signatures["default"]
         self.sequence_encoding_signature = self.module.signatures["encode_sequence"]
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 12c23b744d33..a1452452b98a 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -26,13 +26,19 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
         super().__init__(component_config)
 
-        self._load_tokenizer_params()
+        self._load_model()
 
-    def _load_tokenizer_params(self):
+    def _load_from_tfhub(self, model_url: Text):
+
+        import tensorflow_hub as tfhub
+
+        self.module = tfhub.load(model_url)
+
+    def _load_model(self):
 
         # needed to load the ConveRT model
         import tensorflow_text
-        import tensorflow_hub as tfhub
+
         import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
@@ -40,10 +46,10 @@ def _load_tokenizer_params(self):
         # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
-            self.module = tfhub.load(model_url)
+            self._load_from_tfhub(model_url)
         except OSError:
             os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
-            self.module = tfhub.load(model_url)
+            self._load_from_tfhub(model_url)
 
         self.tokenize_signature = self.module.signatures["tokenize"]
 

From b47a45d3dc5e28270991a48a0e79367134731bfe Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 10:34:48 +0100
Subject: [PATCH 468/633] add test for docker configs

---
 .../config_supervised_embeddings_duckling.yml |  1 +
 data/test_config/embedding_random_seed.yaml   |  1 +
 tests/test_train.py                           | 28 +++++++++++++++++--
 tests/utilities.py                            | 28 +++++++++++++++++++
 4 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/data/test_config/config_supervised_embeddings_duckling.yml b/data/test_config/config_supervised_embeddings_duckling.yml
index c1771ea5addc..7b0635d8a124 100644
--- a/data/test_config/config_supervised_embeddings_duckling.yml
+++ b/data/test_config/config_supervised_embeddings_duckling.yml
@@ -3,5 +3,6 @@ language: "en"
 pipeline:
 - name: "CountVectorsFeaturizer"
 - name: "EmbeddingIntentClassifier"
+  epochs: 2
 - name: "DucklingHTTPExtractor"
   url: "http://duckling:8000"
diff --git a/data/test_config/embedding_random_seed.yaml b/data/test_config/embedding_random_seed.yaml
index 52748baddc77..53ee82926eb3 100644
--- a/data/test_config/embedding_random_seed.yaml
+++ b/data/test_config/embedding_random_seed.yaml
@@ -1,3 +1,4 @@
 policies:
 - name: TEDPolicy
   random_seed: 42
+  epochs: 2
diff --git a/tests/test_train.py b/tests/test_train.py
index af9dc02412f5..23a014341594 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -1,16 +1,15 @@
 import tempfile
 import os
-import shutil
 from typing import Text
 
 import pytest
 from _pytest.monkeypatch import MonkeyPatch
-from _pytest.tmpdir import TempdirFactory
 
 import rasa.model
 
 from rasa.train import train_core, train_nlu, train
 from tests.core.test_model import _fingerprint
+from tests.utilities import update_number_of_epochs
 
 
 @pytest.mark.parametrize(
@@ -131,3 +130,28 @@ def test_train_nlu_temp_files(
     )
 
     assert count_temp_rasa_files(tempfile.tempdir) == 0
+
+
+def docker_config_files():
+    docker_config_path = "docker/configs"
+
+    return [
+        os.path.join(docker_config_path, f)
+        for f in os.listdir(docker_config_path)
+        if os.path.isfile(os.path.join(docker_config_path, f))
+    ]
+
+
+@pytest.mark.parametrize("config_file", docker_config_files())
+def test_train_docker_configs(
+    config_file: Text, tmp_path: Text, default_nlu_data: Text
+):
+    output = str(tmp_path)
+    tmp_config_file = os.path.join(output, "config.yml")
+
+    update_number_of_epochs(config_file, tmp_config_file)
+
+    train_nlu(tmp_config_file, default_nlu_data, output=output)
+
+    files = os.listdir(output)
+    assert any([f.startswith("nlu") and f.endswith("tar.gz") for f in files])
diff --git a/tests/utilities.py b/tests/utilities.py
index 6c334f75905e..a969f71f513a 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -1,5 +1,15 @@
+import tempfile
+
+from typing import Text
 from yarl import URL
 
+import rasa.utils.io as io_utils
+from nlu.classifiers.diet_classifier import DIETClassifier
+from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from nlu.selectors.diet_selector import DIETSelector
+from nlu.selectors.response_selector import ResponseSelector
+from utils.tensorflow.constants import EPOCHS
+
 
 def latest_request(mocked, request_type, path):
     return mocked.requests.get((request_type, URL(path)))
@@ -7,3 +17,21 @@ def latest_request(mocked, request_type, path):
 
 def json_of_latest_request(r):
     return r[-1].kwargs["json"]
+
+
+def update_number_of_epochs(config_path: Text, output_file: Text):
+    config = io_utils.read_yaml_file(config_path)
+
+    if "pipeline" not in config.keys():
+        raise ValueError(f"Invalid config provided! File: '{config_path}'.")
+
+    for component in config["pipeline"]:
+        if component["name"] in [
+            EmbeddingIntentClassifier.name,
+            DIETClassifier.name,
+            ResponseSelector.name,
+            DIETSelector.name,
+        ]:
+            component[EPOCHS] = 2
+
+    io_utils.write_yaml_file(config, output_file)

From 1646fab49a739374ef6f4f105b9707bab7c11608 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 10:40:26 +0100
Subject: [PATCH 469/633] fix docs

---
 docs/nlu/components.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 5aa4d258f048..175143456f09 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1380,7 +1380,7 @@ CRFEntityExtractor
             # BILOU_flag determines whether to use BILOU tagging or not.
             # More rigorous however requires more examples per entity
             # rule of thumb: use only if more than 100 egs. per entity
-            "BILOU_flag": True,
+            "BILOU_flag": True
             # crf_features is [before, token, after] array with before, token,
             # after holding keys about which features to use for each token,
             # for example, 'title' in array before will have the feature
@@ -1403,13 +1403,13 @@ CRFEntityExtractor
                     "pattern",
                 ],
                 ["low", "title", "upper"],
-            ],
+            ]
             # The maximum number of iterations for optimization algorithms.
-            "max_iterations": 50,
+            "max_iterations": 50
             # weight of the L1 regularization
-            "L1_c": 0.1,
+            "L1_c": 0.1
             # weight of the L2 regularization
-            "L2_c": 0.1,
+            "L2_c": 0.1
 
 .. _DucklingHTTPExtractor:
 

From 8109eedeb369bb96bf3a21b6aa50ac63b20d749f Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 10:42:08 +0100
Subject: [PATCH 470/633] added class descriptions

---
 .../dense_featurizer/convert_featurizer.py           | 11 +++++++++--
 .../featurizers/dense_featurizer/lm_featurizer.py    |  5 +++++
 rasa/nlu/tokenizers/convert_tokenizer.py             |  6 ++++++
 rasa/nlu/tokenizers/lm_tokenizer.py                  |  5 +++++
 rasa/nlu/utils/hugging_face/hf_transformers.py       | 12 ++++++++----
 5 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 733af0a88137..bef52ef5ed5d 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -16,6 +16,7 @@
 )
 import numpy as np
 import tensorflow as tf
+import os
 
 from rasa.utils.common import raise_warning
 
@@ -23,6 +24,12 @@
 
 
 class ConveRTFeaturizer(Featurizer):
+    """Featurizer using ConveRT model.
+
+        Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
+        model from TFHub and computes sentence and sequence level feature representations
+        for dense featurizable attributes of each message object.
+    """
 
     provides = [
         DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
@@ -33,17 +40,17 @@ class ConveRTFeaturizer(Featurizer):
     required_components = [ConveRTTokenizer.name]
 
     def _load_from_tfhub(self, model_url: Text):
+        """Load model from TFHub"""
 
         import tensorflow_hub as tfhub
 
         self.module = tfhub.load(model_url)
 
     def _load_model(self) -> None:
+        """Load model from cache if possible, otherwise from TFHub"""
 
         # needed in order to load model
         import tensorflow_text
-        import tensorflow_hub as tfhub
-        import os
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
         # required to take care of cases when other files are already
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 3a8e152c6773..ce8d76073750 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -19,6 +19,11 @@
 
 
 class LanguageModelFeaturizer(Featurizer):
+    """Featurizer using transformer based language models.
+
+        Uses the output of HFTransformersNLP component to set the sequence and sentence
+        level representations for dense featurizable attributes of each message object.
+    """
 
     provides = [
         DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index a1452452b98a..383317b33b96 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -9,6 +9,12 @@
 
 
 class ConveRTTokenizer(WhitespaceTokenizer):
+    """Tokenizer using ConveRT model.
+
+        Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
+        model from TFHub and computes sub-word tokens for dense
+        featurizable attributes of each message object.
+    """
 
     provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
 
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 7fcbe5aca3d0..4c5faeb89dfe 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -14,6 +14,11 @@
 
 
 class LanguageModelTokenizer(Tokenizer):
+    """Tokenizer using transformer based language models.
+
+        Uses the output of HFTransformersNLP component to set the tokens
+        for dense featurizable attributes of each message object.
+    """
 
     provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
 
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 5c52c8b4cde0..70a577ccfec0 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -31,6 +31,14 @@
 
 
 class HFTransformersNLP(Component):
+    """Utility Component for interfacing between Transformers library.
+
+        The transformers(https://github.com/huggingface/transformers) library
+        is used to load pre-trained language models like BERT, GPT-2, etc.
+        The component also tokenizes and featurizes dense featurizable attributes of each
+        message.
+    """
+
     provides = [
         LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
     ]
@@ -55,10 +63,6 @@ def _load_model(self) -> None:
         self.model_name = self.component_config["model_name"]
 
         if self.model_name not in model_class_dict:
-            logger.error(
-                f"'{self.model_name}' not a valid model name. Choose from {str(list(model_class_dict.keys()))} or create"
-                f"a new class inheriting from this class to support your model."
-            )
             raise KeyError(
                 f"'{self.model_name}' not a valid model name. Choose from {str(list(model_class_dict.keys()))}or create"
                 f"a new class inheriting from this class to support your model."

From 2df9c9aee323478bb63b437cbcd297e7c29ba71f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 11:00:34 +0100
Subject: [PATCH 471/633] remove colon

---
 rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index bef52ef5ed5d..356b6af8ca8e 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -211,7 +211,7 @@ def train(
 
             pbar = tqdm(
                 range(0, len(non_empty_examples), batch_size),
-                desc=attribute.capitalize() + " batches:",
+                desc=attribute.capitalize() + " batches",
             )
             for batch_start_index in pbar:
                 batch_end_index = min(

From b51c020a000a9fe184d5eb962d1ad1b54b42ed82 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 11:15:57 +0100
Subject: [PATCH 472/633] suppress logging statement for tensorflow version
 from transformers

---
 rasa/nlu/utils/hugging_face/registry.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py
index 2c52c8109266..a6d68cde8747 100644
--- a/rasa/nlu/utils/hugging_face/registry.py
+++ b/rasa/nlu/utils/hugging_face/registry.py
@@ -1,3 +1,9 @@
+import logging
+
+# Explicitly set logging level for this module before any import
+# because otherwise it logs tensorflow/pytorch versions
+logging.getLogger("transformers.file_utils").setLevel(logging.WARNING)
+
 from transformers import (
     TFBertModel,
     TFOpenAIGPTModel,

From 0fb940d4408d6ad2e251e178b3a58967d3af12d6 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 11:29:56 +0100
Subject: [PATCH 473/633] use configs from files in docs

---
 changelog/5266.feature.rst                    |   2 +-
 data/configs_for_docs/default_config.yml      |  14 ++
 .../default_english_config.yml                |  15 ++
 ...pretrained_embeddings_convert_config_1.yml |   3 +
 ...pretrained_embeddings_convert_config_2.yml |   6 +
 .../pretrained_embeddings_mitie_config_1.yml  |  11 ++
 .../pretrained_embeddings_mitie_config_2.yml  |  10 ++
 .../pretrained_embeddings_spacy_config_1.yml  |   3 +
 .../pretrained_embeddings_spacy_config_2.yml  |  10 ++
 .../supervised_embeddings_config_1.yml        |   3 +
 .../supervised_embeddings_config_2.yml        |  13 ++
 docs/nlu/choosing-a-pipeline.rst              | 149 ++++--------------
 tests/test_train.py                           |  17 +-
 tests/utilities.py                            |   6 +-
 14 files changed, 130 insertions(+), 132 deletions(-)
 create mode 100644 data/configs_for_docs/default_config.yml
 create mode 100644 data/configs_for_docs/default_english_config.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
 create mode 100644 data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
 create mode 100644 data/configs_for_docs/supervised_embeddings_config_1.yml
 create mode 100644 data/configs_for_docs/supervised_embeddings_config_2.yml

diff --git a/changelog/5266.feature.rst b/changelog/5266.feature.rst
index 61b6d5576303..3ad91215a146 100644
--- a/changelog/5266.feature.rst
+++ b/changelog/5266.feature.rst
@@ -1,4 +1,4 @@
-Added a new NLU components ``DIETClassifier`` and a new policy ``TEDPolicy``.
+Added a new NLU component ``DIETClassifier`` and a new policy ``TEDPolicy``.
 
 DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity
 recognition. You can read more about this component in our
diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
new file mode 100644
index 000000000000..0f3cdbf5e593
--- /dev/null
+++ b/data/configs_for_docs/default_config.yml
@@ -0,0 +1,14 @@
+language: "en"
+
+pipeline:
+  - name: WhitespaceTokenizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
\ No newline at end of file
diff --git a/data/configs_for_docs/default_english_config.yml b/data/configs_for_docs/default_english_config.yml
new file mode 100644
index 000000000000..1f264cc796ed
--- /dev/null
+++ b/data/configs_for_docs/default_english_config.yml
@@ -0,0 +1,15 @@
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
\ No newline at end of file
diff --git a/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
new file mode 100644
index 000000000000..f64e16fdded5
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
@@ -0,0 +1,3 @@
+language: "en"
+
+pipeline: "pretrained_embeddings_convert"
\ No newline at end of file
diff --git a/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
new file mode 100644
index 000000000000..8f25d3a60eae
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
@@ -0,0 +1,6 @@
+language: "en"
+
+pipeline:
+- name: "ConveRTTokenizer"
+- name: "ConveRTFeaturizer"
+- name: "EmbeddingIntentClassifier"
\ No newline at end of file
diff --git a/data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml b/data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml
new file mode 100644
index 000000000000..d1b8b86dd953
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml
@@ -0,0 +1,11 @@
+language: "en"
+
+pipeline:
+- name: "MitieNLP"
+  model: "data/total_word_feature_extractor.dat"
+- name: "MitieTokenizer"
+- name: "MitieEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "RegexFeaturizer"
+- name: "MitieFeaturizer"
+- name: "SklearnIntentClassifier"
diff --git a/data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml b/data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml
new file mode 100644
index 000000000000..356eb898e812
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml
@@ -0,0 +1,10 @@
+language: "en"
+
+pipeline:
+- name: "MitieNLP"
+  model: "data/total_word_feature_extractor.dat"
+- name: "MitieTokenizer"
+- name: "MitieEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "RegexFeaturizer"
+- name: "MitieIntentClassifier"
diff --git a/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
new file mode 100644
index 000000000000..33a92d02a4af
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
@@ -0,0 +1,3 @@
+language: "en"
+
+pipeline: "pretrained_embeddings_spacy"
\ No newline at end of file
diff --git a/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
new file mode 100644
index 000000000000..a87aad8163b4
--- /dev/null
+++ b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
@@ -0,0 +1,10 @@
+language: "en"
+
+pipeline:
+- name: "SpacyNLP"
+- name: "SpacyTokenizer"
+- name: "SpacyFeaturizer"
+- name: "RegexFeaturizer"
+- name: "CRFEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "SklearnIntentClassifier"
\ No newline at end of file
diff --git a/data/configs_for_docs/supervised_embeddings_config_1.yml b/data/configs_for_docs/supervised_embeddings_config_1.yml
new file mode 100644
index 000000000000..8ef74b4bcf93
--- /dev/null
+++ b/data/configs_for_docs/supervised_embeddings_config_1.yml
@@ -0,0 +1,3 @@
+language: "en"
+
+pipeline: "supervised_embeddings"
\ No newline at end of file
diff --git a/data/configs_for_docs/supervised_embeddings_config_2.yml b/data/configs_for_docs/supervised_embeddings_config_2.yml
new file mode 100644
index 000000000000..2bcf54835cc2
--- /dev/null
+++ b/data/configs_for_docs/supervised_embeddings_config_2.yml
@@ -0,0 +1,13 @@
+language: "en"
+
+pipeline:
+- name: "WhitespaceTokenizer"
+- name: "RegexFeaturizer"
+- name: "CRFEntityExtractor"
+- name: "EntitySynonymMapper"
+- name: "CountVectorsFeaturizer"
+- name: "CountVectorsFeaturizer"
+  analyzer: "char_wb"
+  min_ngram: 1
+  max_ngram: 4
+- name: "EmbeddingIntentClassifier"
\ No newline at end of file
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index acd92025b882..93e773c4bbae 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -23,44 +23,14 @@ The Short Answer
 
 If your training data is in english, a good starting point is the following pipeline:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-      - name: ConveRTTokenizer
-      - name: ConveRTFeaturizer
-      - name: RegexFeaturizer
-      - name: LexicalSyntacticFeaturizer
-      - name: CountVectorsFeaturizer
-      - name: CountVectorsFeaturizer
-        analyzer: "char_wb"
-        min_ngram: 1
-        max_ngram: 4
-      - name: DIETClassifier
-      - name: EntitySynonymMapper
-      - name: DIETSelector
-
+.. literalinclude:: ../../data/configs_for_docs/default_english_config.yml
+    :language: yaml
 
 In case your training data is multi-lingual and is rich with domain specific vocabulary,
 use the following pipeline:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-      - name: WhitespaceTokenizer
-      - name: RegexFeaturizer
-      - name: LexicalSyntacticFeaturizer
-      - name: CountVectorsFeaturizer
-      - name: CountVectorsFeaturizer
-        analyzer: "char_wb"
-        min_ngram: 1
-        max_ngram: 4
-      - name: DIETClassifier
-      - name: EntitySynonymMapper
-      - name: DIETSelector
+.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+    :language: yaml
 
 
 A Longer Answer
@@ -69,20 +39,8 @@ A Longer Answer
 We encourage everyone to define their own pipeline by listing the names of the components you want to use.
 For example:
 
-.. code-block:: yaml
-
-    pipeline:
-      - name: WhitespaceTokenizer
-      - name: RegexFeaturizer
-      - name: LexicalSyntacticFeaturizer
-      - name: CountVectorsFeaturizer
-      - name: CountVectorsFeaturizer
-        analyzer: "char_wb"
-        min_ngram: 1
-        max_ngram: 4
-      - name: DIETClassifier
-      - name: EntitySynonymMapper
-      - name: DIETSelector
+.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+    :language: yaml
 
 You can find the details of each component in :ref:`components`.
 If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
@@ -125,22 +83,23 @@ We support a few components that provide pre-trained word embeddings:
 3. ``ConveRTFeaturizer``
 4. ``LanguageModelFeaturizer``
 
+If your training data is in English, we recommend to use the ``ConveRTFeaturizer``.
 The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence. For example, if you
 have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
 for both is highly likely to be the same. This is also useful if you don't have large enough training data.
 
-TODO when to use what featurizer
+``SpacyFeaturizer`` provides word embeddings in many different language (see :ref:`pretrained-word-vectors`).
+So in case, your training data is not in Enlgish you might want to use this featurizer.
 
 Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Depending on your data you may want to only perform intent classification or just entity recognition.
+Depending on your data you may want to only perform intent classification or entity recognition.
 We support several components for each of the task. All of them are listed in :ref:`components`.
-We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`response-selector`
+We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`diet-selector`
 for response selection.
 
-
 Comparing different pipelines for your data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -170,7 +129,7 @@ Balanced batching is used by default. In order to turn it off and use a classic
     language: "en"
 
     pipeline:
-    - ... # other components
+    # - ... other components
     - name: "DIETClassifier"
       batch_strategy: sequence
 
@@ -240,22 +199,8 @@ the processing has finished. For example, for the sentence ``"I am looking for C
 
 This is created as a combination of the results of the different components in the following pipeline:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-      - name: WhitespaceTokenizer
-      - name: RegexFeaturizer
-      - name: LexicalSyntacticFeaturizer
-      - name: CountVectorsFeaturizer
-      - name: CountVectorsFeaturizer
-        analyzer: "char_wb"
-        min_ngram: 1
-        max_ngram: 4
-      - name: DIETClassifier
-      - name: EntitySynonymMapper
-      - name: DIETSelector
+.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+    :language: yaml
 
 For example, the ``entities`` attribute is created by the ``DIETClassifier`` component.
 
@@ -371,27 +316,14 @@ if you don't have large enough training data.
 
 To use the ``pretrained_embeddings_spacy`` template, use the following configuration:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline: "pretrained_embeddings_spacy"
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
+    :language: yaml
 
 See :ref:`pretrained-word-vectors` for more information about loading spacy language models.
 To use the components and configure them separately:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-    - name: "SpacyNLP"
-    - name: "SpacyTokenizer"
-    - name: "SpacyFeaturizer"
-    - name: "RegexFeaturizer"
-    - name: "CRFEntityExtractor"
-    - name: "EntitySynonymMapper"
-    - name: "SklearnIntentClassifier"
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
+    :language: yaml
 
 .. _section_pretrained_embeddings_convert_pipeline:
 
@@ -417,22 +349,13 @@ for both is highly likely to be the same. This is also useful if you don't have
 
 To use the ``pretrained_embeddings_convert`` template:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline: "pretrained_embeddings_convert"
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
+    :language: yaml
 
 To use the components and configure them separately:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-    - name: "ConveRTTokenizer"
-    - name: "ConveRTFeaturizer"
-    - name: "EmbeddingIntentClassifier"
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
+    :language: yaml
 
 .. _section_supervised_embeddings_pipeline:
 
@@ -450,31 +373,15 @@ You can read more about this topic `here <https://medium.com/rasa-blog/supervise
 To train a Rasa model in your preferred language, define the
 ``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline: "supervised_embeddings"
+.. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_1.yml
+    :language: yaml
 
 The ``supervised_embeddings`` pipeline supports any language that can be tokenized.  By default it uses whitespace
 for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are the default
 components that make up the ``supervised_embeddings`` pipeline:
 
-.. code-block:: yaml
-
-    language: "en"
-
-    pipeline:
-    - name: "WhitespaceTokenizer"
-    - name: "RegexFeaturizer"
-    - name: "CRFEntityExtractor"
-    - name: "EntitySynonymMapper"
-    - name: "CountVectorsFeaturizer"
-    - name: "CountVectorsFeaturizer"
-      analyzer: "char_wb"
-      min_ngram: 1
-      max_ngram: 4
-    - name: "EmbeddingIntentClassifier"
+.. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_2.yml
+    :language: yaml
     
 So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you
 can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers <tokenizers>`,
@@ -500,11 +407,11 @@ However, we do not recommend that you use it as mitie support is likely to be de
 To use the MITIE pipeline, you will have to train word vectors from a corpus. Instructions can be found
 :ref:`here <mitie>`. This will give you the file path to pass to the ``model`` parameter.
 
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_mitie.yml
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml
     :language: yaml
 
 Another version of this pipeline uses MITIE's featurizer and also its multi-class classifier.
 Training can be quite slow, so this is not recommended for large datasets.
 
-.. literalinclude:: ../../data/test_config/config_pretrained_embeddings_mitie_2.yml
+.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml
     :language: yaml
\ No newline at end of file
diff --git a/tests/test_train.py b/tests/test_train.py
index 23a014341594..d0bb3194ff39 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -132,18 +132,19 @@ def test_train_nlu_temp_files(
     assert count_temp_rasa_files(tempfile.tempdir) == 0
 
 
-def docker_config_files():
-    docker_config_path = "docker/configs"
-
+def config_files_in(config_directory: Text):
     return [
-        os.path.join(docker_config_path, f)
-        for f in os.listdir(docker_config_path)
-        if os.path.isfile(os.path.join(docker_config_path, f))
+        os.path.join(config_directory, f)
+        for f in os.listdir(config_directory)
+        if os.path.isfile(os.path.join(config_directory, f))
     ]
 
 
-@pytest.mark.parametrize("config_file", docker_config_files())
-def test_train_docker_configs(
+@pytest.mark.parametrize(
+    "config_file",
+    config_files_in("data/configs_for_docs") + config_files_in("docker/configs"),
+)
+def test_train_docker_and_docs_configs(
     config_file: Text, tmp_path: Text, default_nlu_data: Text
 ):
     output = str(tmp_path)
diff --git a/tests/utilities.py b/tests/utilities.py
index a969f71f513a..f776052ac9be 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -1,5 +1,3 @@
-import tempfile
-
 from typing import Text
 from yarl import URL
 
@@ -26,6 +24,10 @@ def update_number_of_epochs(config_path: Text, output_file: Text):
         raise ValueError(f"Invalid config provided! File: '{config_path}'.")
 
     for component in config["pipeline"]:
+        # do not update epochs for pipeline templates
+        if not isinstance(component, dict):
+            continue
+
         if component["name"] in [
             EmbeddingIntentClassifier.name,
             DIETClassifier.name,

From 8199fcf62b9e68e7607d4f25f010e0bf4aa74454 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 11:35:16 +0100
Subject: [PATCH 474/633] review comments

---
 changelog/5187.feature.rst    | 10 +++++-----
 changelog/5230.feature.rst    |  2 +-
 docs/api/tensorflow_usage.rst |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst
index 41f348c240e8..8822d42feecc 100644
--- a/changelog/5187.feature.rst
+++ b/changelog/5187.feature.rst
@@ -1,7 +1,7 @@
-Integrate language models from HuggingFace's Transformers Library.
+Integrate language models from HuggingFace's `Transformers <https://github.com/huggingface/transformers>`_ Library.
 
-Add a new NLP component ``HFTransformersNLP`` which tokenizes and featurizes incoming messages using a specified
-pre-trained model with the Transformers library as the backend.
-Add ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from HFTransformersNLP and
-sets them correctly for message object.
+Add a new NLP component `HFTransformersNLP <https://rasa.com/docs/rasa/nlu/components.html#hftransformersnlp>`_ which
+tokenizes and featurizes incoming messages using a specified pre-trained model with the Transformers library as the backend.
+Add ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from ``HFTransformersNLP``
+and sets them correctly for message object.
 Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa
diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst
index 5aeae8295c2e..89f2f92941aa 100644
--- a/changelog/5230.feature.rst
+++ b/changelog/5230.feature.rst
@@ -1,4 +1,4 @@
-Refactor how GPU and CPU environments are configured for TensorFlow 2.0
+Refactor how GPU and CPU environments are configured for TensorFlow 2.0.
 
 Please refer to the `documentation <https://rasa.com/docs/rasa/api/tensorflow_usage.html>`_ to understand
 which environment variables to set in what scenarios. A couple of examples are shown below as well:
diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index ef6786b294b3..e45b0fde3617 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -1,4 +1,4 @@
-:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa Open Source
+:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa Open Source.
 
 .. _tensorflow_usage:
 

From f297015f17b6f1be08f3acef5e9ee899fce8768b Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 11:51:43 +0100
Subject: [PATCH 475/633] add links to pipeline docs

---
 docs/nlu/choosing-a-pipeline.rst |  8 ++++----
 docs/nlu/components.rst          | 11 +++++++++--
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 93e773c4bbae..ddabb61c8776 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -78,10 +78,10 @@ The advantage of using pre-trained word embeddings in your pipeline is that if y
 words "apples" and "pears" are very similar. This is especially useful if you don't have large enough training data.
 We support a few components that provide pre-trained word embeddings:
 
-1. ``MitieFeaturizer``
-2. ``SpacyFeaturizer``
-3. ``ConveRTFeaturizer``
-4. ``LanguageModelFeaturizer``
+1. :ref:`MitieFeaturizer`
+2. :ref:`SpacyFeaturizer`
+3. :ref:`ConveRTFeaturizer`
+4. :ref:`LanguageModelFeaturizer`
 
 If your training data is in English, we recommend to use the ``ConveRTFeaturizer``.
 The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index bc57f38e3143..9b3da64f794f 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -248,11 +248,11 @@ SpacyTokenizer
 ConveRTTokenizer
 ~~~~~~~~~~~~~~~~
 
-:Short: Tokenizer using ConveRT
+:Short: Tokenizer using `ConveRT <https://github.com/PolyAI-LDN/polyai-models#convert>`_
 :Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
 :Requires: Nothing
 :Description:
-    Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.
+    Creates tokens using the ConveRT tokenizer. Must be used whenever the :ref:`ConveRTFeaturizer` is used.
 :Configuration:
     Make the tokenizer not case sensitive by adding the ``case_sensitive: False`` option.
     Default being ``case_sensitive: True``.
@@ -307,6 +307,9 @@ However, the additional token at the end (e.g. ``__CLS__``) contains features fo
 This feature vector can be used in any non-sequence model.
 The corresponding classifier can therefore decide what kind of features to use.
 
+
+.. _MitieFeaturizer:
+
 MitieFeaturizer
 ~~~~~~~~~~~~~~~
 
@@ -337,6 +340,8 @@ MitieFeaturizer
           "pooling": "mean"
 
 
+.. _SpacyFeaturizer:
+
 SpacyFeaturizer
 ~~~~~~~~~~~~~~~
 
@@ -362,6 +367,8 @@ SpacyFeaturizer
           "pooling": "mean"
 
 
+.. _ConveRTFeaturizer:
+
 ConveRTFeaturizer
 ~~~~~~~~~~~~~~~~~
 

From 9822b315289d39144bb76468ae113db27279cfc2 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 12:27:11 +0100
Subject: [PATCH 476/633] added language model specific info to docs

---
 docs/nlu/choosing-a-pipeline.rst | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index ddabb61c8776..1b437a25a224 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -90,8 +90,19 @@ have a training example, like: "can I book a car?", and Rasa is asked to predict
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
 for both is highly likely to be the same. This is also useful if you don't have large enough training data.
 
-``SpacyFeaturizer`` provides word embeddings in many different language (see :ref:`pretrained-word-vectors`).
-So in case, your training data is not in Enlgish you might want to use this featurizer.
+An alternative to ``ConveRTFeaturizer`` can be ``LanguageModelFeaturizer`` which uses pre-trained language models such as
+BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See :ref:`HFTransformersNLP`
+for a full list of supported language models.
+
+In case, your training data is not in English you can also use a different variant of a language model which
+is pre-trained in the language specific to your training data. For example, there is a chinese language variant of
+BERT(``bert-base-chinese``) or a japanese variant of it(``bert-base-japanese``). A full list of different variants of these
+language models is available in the
+`official docs of Transformers library <https://huggingface.co/transformers/pretrained_models.html>_`
+
+``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`).
+So, this featurizer can also be an alternate option depending on the language of your training data.
+
 
 Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 5fb1e098e498535436c148dd49e1a2727f71eca6 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 12:28:09 +0100
Subject: [PATCH 477/633] fix typo

---
 docs/nlu/choosing-a-pipeline.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 1b437a25a224..174b3beb018d 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -48,7 +48,7 @@ If you want to use custom components in your pipeline, see :ref:`custom-nlu-comp
 A pipeline usually consist of three main parts:
 
     1. Tokenizaion
-    2. Featuirzation
+    2. Featurization
     3. Entity Recognition / Intent Classification / Response Selectors
 
 Tokenization

From 357f2c4ee6842f54caf533794fd5676e97350909 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 12:31:30 +0100
Subject: [PATCH 478/633] make sparsity configurable, Response selector is a
 subclass of diet selector

---
 rasa/core/policies/embedding_policy.py  |  6 ++---
 rasa/core/policies/ted_policy.py        | 10 +++++----
 rasa/nlu/classifiers/diet_classifier.py |  9 ++++----
 rasa/nlu/selectors/diet_selector.py     |  6 ++---
 rasa/nlu/selectors/response_selector.py |  7 ++++--
 rasa/utils/tensorflow/constants.py      |  3 +--
 rasa/utils/tensorflow/layers.py         |  4 +++-
 rasa/utils/tensorflow/transformer.py    | 29 ++++++++++++++++++-------
 rasa/utils/train_utils.py               |  2 --
 9 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index a500abca6f48..ee2828f6c60a 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -11,7 +11,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -32,6 +31,7 @@
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
+    WEIGHTS_SPARSITY,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -58,8 +58,6 @@ class EmbeddingPolicy(TEDPolicy):
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # training parameters
@@ -106,6 +104,8 @@ class EmbeddingPolicy(TEDPolicy):
         DROPRATE_LABEL: 0.0,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
+        # sparsity of the weights in dense layers
+        WEIGHTS_SPARSITY: 0.8,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 9d65e00901e7..947bcf0ca149 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -31,7 +31,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -52,6 +51,7 @@
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
+    WEIGHTS_SPARSITY,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -79,8 +79,6 @@ class TEDPolicy(Policy):
         TRANSFORMER_SIZE: 128,
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # max sequence length
-        MAX_SEQ_LENGTH: 256,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # if true use key relative embeddings in attention
@@ -133,6 +131,8 @@ class TEDPolicy(Policy):
         DROPRATE_LABEL: 0.0,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
+        # sparsity of the weights in dense layers
+        WEIGHTS_SPARSITY: 0.8,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -514,12 +514,14 @@ def _prepare_layers(self) -> None:
             self.config[HIDDEN_LAYERS_SIZES][DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
             self.config[REGULARIZATION_CONSTANT],
+            self.config[WEIGHTS_SPARSITY],
             layer_name_suffix=DIALOGUE,
         )
         self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][LABEL],
             self.config[DROPRATE_LABEL],
             self.config[REGULARIZATION_CONSTANT],
+            self.config[WEIGHTS_SPARSITY],
             layer_name_suffix=LABEL,
         )
         self._tf_layers["transformer"] = TransformerEncoder(
@@ -527,10 +529,10 @@ def _prepare_layers(self) -> None:
             self.config[TRANSFORMER_SIZE],
             self.config[NUM_HEADS],
             self.config[TRANSFORMER_SIZE] * 4,
-            self.config[MAX_SEQ_LENGTH],
             self.config[REGULARIZATION_CONSTANT],
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=self.config[DROPRATE_ATTENTION],
+            sparsity=self.config[WEIGHTS_SPARSITY],
             unidirectional=True,
             use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
             use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 935c8356f7ca..1f720e07a22d 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -41,7 +41,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -61,6 +60,7 @@
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     DROPRATE_ATTENTION,
+    WEIGHTS_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -122,8 +122,6 @@ class DIETClassifier(EntityExtractor):
         VALUE_RELATIVE_ATTENTION: False,
         # max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
-        # max sequence length
-        MAX_SEQ_LENGTH: 256,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # training parameters
@@ -171,6 +169,8 @@ class DIETClassifier(EntityExtractor):
         DROPRATE: 0.2,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
+        # sparsity of the weights in dense layers
+        WEIGHTS_SPARSITY: 0.8,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -1049,6 +1049,7 @@ def _prepare_input_layers(self, name: Text) -> None:
             self.config[HIDDEN_LAYERS_SIZES][name],
             self.config[DROPRATE],
             self.config[REGULARIZATION_CONSTANT],
+            self.config[WEIGHTS_SPARSITY],
             name,
         )
 
@@ -1061,10 +1062,10 @@ def _prepare_sequence_layers(self, name: Text) -> None:
                 self.config[TRANSFORMER_SIZE],
                 self.config[NUM_HEADS],
                 self.config[TRANSFORMER_SIZE] * 4,
-                self.config[MAX_SEQ_LENGTH],
                 self.config[REGULARIZATION_CONSTANT],
                 dropout_rate=self.config[DROPRATE],
                 attention_dropout_rate=self.config[DROPRATE_ATTENTION],
+                sparsity=self.config[WEIGHTS_SPARSITY],
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
                 use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
                 use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index a6e691a7c8cb..f243dc26d04a 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -15,7 +15,6 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -35,6 +34,7 @@
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     DROPRATE_ATTENTION,
+    WEIGHTS_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -103,8 +103,6 @@ class DIETSelector(DIETClassifier):
         NUM_TRANSFORMER_LAYERS: 0,
         # number of attention heads in transformer
         NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -151,6 +149,8 @@ class DIETSelector(DIETClassifier):
         DROPRATE: 0.2,
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
+        # sparsity of the weights in dense layers
+        WEIGHTS_SPARSITY: 0.8,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 5df777521d79..f11ededeec92 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, Optional, Text
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.selectors.diet_selector import DIETSelector
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     LABEL,
@@ -27,6 +27,7 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     DROPRATE,
+    WEIGHTS_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -52,7 +53,7 @@
 logger = logging.getLogger(__name__)
 
 
-class ResponseSelector(DIETClassifier):
+class ResponseSelector(DIETSelector):
     """Response selector using supervised embeddings.
 
     The response selector embeds user inputs
@@ -132,6 +133,8 @@ class ResponseSelector(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
+        # sparsity of the weights in dense layers
+        WEIGHTS_SPARSITY: 0.8,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index bbeef0c6539a..f49f6dc6bcf4 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -12,8 +12,6 @@
 VALUE_RELATIVE_ATTENTION = "use_value_relative_attention"
 MAX_RELATIVE_POSITION = "max_relative_position"
 
-MAX_SEQ_LENGTH = "maximum_sequence_length"
-
 BATCH_SIZES = "batch_size"
 BATCH_STRATEGY = "batch_strategy"
 EPOCHS = "epochs"
@@ -37,6 +35,7 @@
 DROPRATE_ATTENTION = "droprate_attention"
 DROPRATE_DIALOGUE = "droprate_dialogue"
 DROPRATE_LABEL = "droprate_label"
+WEIGHTS_SPARSITY = "weights_sparsity"
 
 EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
 EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index a2bf2e192e2d..08426cd26236 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -66,7 +66,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
 
 
 class DenseWithSparseWeights(tf.keras.layers.Dense):
-    def __init__(self, sparsity: int = 0.8, **kwargs) -> None:
+    def __init__(self, sparsity: float = 0.8, **kwargs) -> None:
         super().__init__(**kwargs)
         self.sparsity = sparsity
 
@@ -95,6 +95,7 @@ def __init__(
         layer_sizes: List[int],
         dropout_rate: float,
         reg_lambda: float,
+        sparsity: float,
         layer_name_suffix: Text,
     ) -> None:
         super().__init__(name=f"ffnn_{layer_name_suffix}")
@@ -105,6 +106,7 @@ def __init__(
             self._ffn_layers.append(
                 DenseWithSparseWeights(
                     units=layer_size,
+                    sparsity=sparsity,
                     activation=tfa.activations.gelu,
                     kernel_regularizer=l2_regularizer,
                     name=f"hidden_layer_{layer_name_suffix}_{i}",
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 8dfde8a63771..8398821363bd 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -15,6 +15,7 @@ def __init__(
         units: int,
         num_heads: int,
         attention_dropout_rate: float = 0.0,
+        sparsity: float = 0.8,
         unidirectional: bool = False,
         use_key_relative_position: bool = False,
         use_value_relative_position: bool = False,
@@ -42,11 +43,17 @@ def __init__(
 
         self._depth = units // self.num_heads
 
-        self._wq = DenseWithSparseWeights(units=units, use_bias=False)
-        self._wk = DenseWithSparseWeights(units=units, use_bias=False)
-        self._wv = DenseWithSparseWeights(units=units, use_bias=False)
+        self._wq = DenseWithSparseWeights(
+            units=units, use_bias=False, sparsity=sparsity
+        )
+        self._wk = DenseWithSparseWeights(
+            units=units, use_bias=False, sparsity=sparsity
+        )
+        self._wv = DenseWithSparseWeights(
+            units=units, use_bias=False, sparsity=sparsity
+        )
 
-        self._dense = DenseWithSparseWeights(units=units)
+        self._dense = DenseWithSparseWeights(units=units, sparsity=sparsity)
 
         self._create_relative_embeddings()
 
@@ -301,6 +308,7 @@ def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
             x, (tf.shape(x)[0], -1, self.units)
         )  # (batch_size, seq_len_q, units)
 
+    # noinspection PyMethodOverriding
     def call(
         self,
         v: tf.Tensor,
@@ -340,6 +348,7 @@ def __init__(
         filter_units: int,
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
+        sparsity: float = 0.8,
         unidirectional: bool = False,
         use_key_relative_position: bool = False,
         use_value_relative_position: bool = False,
@@ -353,6 +362,7 @@ def __init__(
             units,
             num_heads,
             attention_dropout_rate,
+            sparsity,
             unidirectional,
             use_key_relative_position,
             use_value_relative_position,
@@ -364,10 +374,12 @@ def __init__(
         self._ffn_layers = [
             tf.keras.layers.LayerNormalization(epsilon=1e-6),
             DenseWithSparseWeights(
-                units=filter_units, activation=tfa.activations.gelu
+                units=filter_units, activation=tfa.activations.gelu, sparsity=sparsity
             ),  # (batch_size, seq_len, filter_units)
             tf.keras.layers.Dropout(dropout_rate),
-            DenseWithSparseWeights(units=units),  # (batch_size, seq_len, units)
+            DenseWithSparseWeights(
+                units=units, sparsity=sparsity
+            ),  # (batch_size, seq_len, units)
             tf.keras.layers.Dropout(dropout_rate),
         ]
 
@@ -402,10 +414,10 @@ def __init__(
         units: int,
         num_heads: int,
         filter_units: int,
-        max_seq_length: int,
         reg_lambda: float,
         dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
+        sparsity: float = 0.8,
         unidirectional: bool = False,
         use_key_relative_position: bool = False,
         use_value_relative_position: bool = False,
@@ -420,7 +432,7 @@ def __init__(
 
         l2_regularizer = tf.keras.regularizers.l2(reg_lambda)
         self._embedding = DenseWithSparseWeights(
-            units=units, kernel_regularizer=l2_regularizer
+            units=units, kernel_regularizer=l2_regularizer, sparsity=sparsity
         )
         # positional encoding helpers
         self._angles = self._get_angles()
@@ -436,6 +448,7 @@ def __init__(
                 filter_units,
                 dropout_rate,
                 attention_dropout_rate,
+                sparsity,
                 unidirectional,
                 use_key_relative_position,
                 use_value_relative_position,
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 0d68f6a9744d..4850b5bb9fdc 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -13,7 +13,6 @@
     HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
     DENSE_DIM,
     LOSS_TYPE,
     SIMILARITY_TYPE,
@@ -155,7 +154,6 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
         "num_transformer_layers", NUM_TRANSFORMER_LAYERS, config
     )
     config = _replace_deprecated_option("num_heads", NUM_HEADS, config)
-    config = _replace_deprecated_option("max_seq_length", MAX_SEQ_LENGTH, config)
     config = _replace_deprecated_option("dense_dim", DENSE_DIM, config)
     config = _replace_deprecated_option("embed_dim", EMBED_DIM, config)
     config = _replace_deprecated_option("num_neg", NUM_NEG, config)

From b6d2667f624da920a26aaf1038d6cb4e37a55871 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 12:33:10 +0100
Subject: [PATCH 479/633] rename constant

---
 rasa/core/policies/embedding_policy.py  |  4 ++--
 rasa/core/policies/ted_policy.py        | 10 +++++-----
 rasa/nlu/classifiers/diet_classifier.py |  8 ++++----
 rasa/nlu/selectors/diet_selector.py     |  4 ++--
 rasa/nlu/selectors/response_selector.py |  4 ++--
 rasa/utils/tensorflow/constants.py      |  3 ++-
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index ee2828f6c60a..a9692a73a417 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -31,7 +31,7 @@
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
-    WEIGHTS_SPARSITY,
+    WEIGHT_SPARSITY,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -105,7 +105,7 @@ class EmbeddingPolicy(TEDPolicy):
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
         # sparsity of the weights in dense layers
-        WEIGHTS_SPARSITY: 0.8,
+        WEIGHT_SPARSITY: 0.8,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 947bcf0ca149..959621ad7ea6 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -51,7 +51,7 @@
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
-    WEIGHTS_SPARSITY,
+    WEIGHT_SPARSITY,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -132,7 +132,7 @@ class TEDPolicy(Policy):
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
         # sparsity of the weights in dense layers
-        WEIGHTS_SPARSITY: 0.8,
+        WEIGHT_SPARSITY: 0.8,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -514,14 +514,14 @@ def _prepare_layers(self) -> None:
             self.config[HIDDEN_LAYERS_SIZES][DIALOGUE],
             self.config[DROPRATE_DIALOGUE],
             self.config[REGULARIZATION_CONSTANT],
-            self.config[WEIGHTS_SPARSITY],
+            self.config[WEIGHT_SPARSITY],
             layer_name_suffix=DIALOGUE,
         )
         self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][LABEL],
             self.config[DROPRATE_LABEL],
             self.config[REGULARIZATION_CONSTANT],
-            self.config[WEIGHTS_SPARSITY],
+            self.config[WEIGHT_SPARSITY],
             layer_name_suffix=LABEL,
         )
         self._tf_layers["transformer"] = TransformerEncoder(
@@ -532,7 +532,7 @@ def _prepare_layers(self) -> None:
             self.config[REGULARIZATION_CONSTANT],
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=self.config[DROPRATE_ATTENTION],
-            sparsity=self.config[WEIGHTS_SPARSITY],
+            sparsity=self.config[WEIGHT_SPARSITY],
             unidirectional=True,
             use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
             use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 1f720e07a22d..6107ff955153 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -60,7 +60,7 @@
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     DROPRATE_ATTENTION,
-    WEIGHTS_SPARSITY,
+    WEIGHT_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -170,7 +170,7 @@ class DIETClassifier(EntityExtractor):
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
         # sparsity of the weights in dense layers
-        WEIGHTS_SPARSITY: 0.8,
+        WEIGHT_SPARSITY: 0.8,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -1049,7 +1049,7 @@ def _prepare_input_layers(self, name: Text) -> None:
             self.config[HIDDEN_LAYERS_SIZES][name],
             self.config[DROPRATE],
             self.config[REGULARIZATION_CONSTANT],
-            self.config[WEIGHTS_SPARSITY],
+            self.config[WEIGHT_SPARSITY],
             name,
         )
 
@@ -1065,7 +1065,7 @@ def _prepare_sequence_layers(self, name: Text) -> None:
                 self.config[REGULARIZATION_CONSTANT],
                 dropout_rate=self.config[DROPRATE],
                 attention_dropout_rate=self.config[DROPRATE_ATTENTION],
-                sparsity=self.config[WEIGHTS_SPARSITY],
+                sparsity=self.config[WEIGHT_SPARSITY],
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
                 use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
                 use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index f243dc26d04a..1901b0b2fa41 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -34,7 +34,7 @@
     UNIDIRECTIONAL_ENCODER,
     DROPRATE,
     DROPRATE_ATTENTION,
-    WEIGHTS_SPARSITY,
+    WEIGHT_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -150,7 +150,7 @@ class DIETSelector(DIETClassifier):
         # dropout rate for attention
         DROPRATE_ATTENTION: 0,
         # sparsity of the weights in dense layers
-        WEIGHTS_SPARSITY: 0.8,
+        WEIGHT_SPARSITY: 0.8,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index f11ededeec92..2facc7ba03c6 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -27,7 +27,7 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     DROPRATE,
-    WEIGHTS_SPARSITY,
+    WEIGHT_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -134,7 +134,7 @@ class ResponseSelector(DIETSelector):
         # dropout rate for rnn
         DROPRATE: 0.2,
         # sparsity of the weights in dense layers
-        WEIGHTS_SPARSITY: 0.8,
+        WEIGHT_SPARSITY: 0.8,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index f49f6dc6bcf4..0904ec4dfdad 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -35,7 +35,8 @@
 DROPRATE_ATTENTION = "droprate_attention"
 DROPRATE_DIALOGUE = "droprate_dialogue"
 DROPRATE_LABEL = "droprate_label"
-WEIGHTS_SPARSITY = "weights_sparsity"
+
+WEIGHT_SPARSITY = "weight_sparsity"
 
 EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
 EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"

From b74c7d12f681c324cc8644b57e8fe8d301df7f1a Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 12:37:35 +0100
Subject: [PATCH 480/633] fix duplicate link

---
 docs/nlu/components.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 9b3da64f794f..e64d7bc9832c 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -248,7 +248,7 @@ SpacyTokenizer
 ConveRTTokenizer
 ~~~~~~~~~~~~~~~~
 
-:Short: Tokenizer using `ConveRT <https://github.com/PolyAI-LDN/polyai-models#convert>`_
+:Short: Tokenizer using `ConveRT <https://github.com/PolyAI-LDN/polyai-models#convert>`__ model.
 :Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
 :Requires: Nothing
 :Description:
@@ -374,7 +374,7 @@ ConveRTFeaturizer
 
 :Short:
     Creates a vector representation of user message and response (if specified) using
-    `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model.
+    `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`__ model.
 :Outputs: ``dense_features`` for texts and responses
 :Requires: :ref:`ConveRTTokenizer`
 :Type: Dense featurizer

From e2caee020c0bd04a0be6099745a72727aafb4e75 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 12:48:55 +0100
Subject: [PATCH 481/633] use self.epochs to set current epoch

---
 rasa/core/policies/keras_policy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 20ba85885db6..a6497f6c5ce0 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -197,8 +197,7 @@ def train(
             verbose=obtain_verbosity(),
             **self._train_params,
         )
-        # the default parameter for epochs in keras fit is 1
-        self.current_epoch = self.defaults.get("epochs", 1)
+        self.current_epoch = self.epochs
         logger.info("Done fitting keras policy model")
 
     def continue_training(

From ccca6ac24c5e57248ee1ee7a7823ef9b4399dd7b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 13:34:59 +0100
Subject: [PATCH 482/633] update links in changelogs

---
 changelog/5065.feature.rst     | 2 +-
 changelog/5187.feature.rst     | 2 +-
 changelog/5230.feature.rst     | 2 +-
 changelog/5266.feature.rst     | 5 ++---
 changelog/5266.improvement.rst | 2 +-
 changelog/5266.misc.rst        | 2 +-
 changelog/663.feature.rst      | 1 +
 changelog/699.misc.rst         | 9 +++------
 8 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/changelog/5065.feature.rst b/changelog/5065.feature.rst
index 1f2ea11be93b..4360dee5c431 100644
--- a/changelog/5065.feature.rst
+++ b/changelog/5065.feature.rst
@@ -1,4 +1,4 @@
-Add ``LexicalSyntacticFeaturizer`` to sparse featurizers.
+Add :ref:`LexicalSyntacticFeaturizer` to sparse featurizers.
 
 ``LexicalSyntacticFeaturizer`` does the same featurization as the ``CRFEntityExtractor``. We extracted the
 featurization into a separate component so that the features can be reused and featurization is independent from the
diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst
index 8822d42feecc..0ddcb3596438 100644
--- a/changelog/5187.feature.rst
+++ b/changelog/5187.feature.rst
@@ -1,6 +1,6 @@
 Integrate language models from HuggingFace's `Transformers <https://github.com/huggingface/transformers>`_ Library.
 
-Add a new NLP component `HFTransformersNLP <https://rasa.com/docs/rasa/nlu/components.html#hftransformersnlp>`_ which
+Add a new NLP component :ref:`HFTransformersNLP <HFTransformersNLP>` which
 tokenizes and featurizes incoming messages using a specified pre-trained model with the Transformers library as the backend.
 Add ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from ``HFTransformersNLP``
 and sets them correctly for message object.
diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst
index 89f2f92941aa..8b1c3769fcc6 100644
--- a/changelog/5230.feature.rst
+++ b/changelog/5230.feature.rst
@@ -1,6 +1,6 @@
 Refactor how GPU and CPU environments are configured for TensorFlow 2.0.
 
-Please refer to the `documentation <https://rasa.com/docs/rasa/api/tensorflow_usage.html>`_ to understand
+Please refer to the :ref:`documentation <tensorflow_usage>` to understand
 which environment variables to set in what scenarios. A couple of examples are shown below as well:
 
 .. code-block:: python
diff --git a/changelog/5266.feature.rst b/changelog/5266.feature.rst
index 3ad91215a146..f9be522ddeab 100644
--- a/changelog/5266.feature.rst
+++ b/changelog/5266.feature.rst
@@ -1,11 +1,10 @@
 Added a new NLU component ``DIETClassifier`` and a new policy ``TEDPolicy``.
 
 DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity
-recognition. You can read more about this component in our
-`documentation <https://rasa.com/docs/rasa/nlu/components/#diet-classifier>`_.
+recognition. You can read more about this component in our :ref:`documentation <diet-classifier>`.
 The new component will replace the ``EmbeddingIntentClassifier`` and the ``CRFEntityExtractor`` in the future.
 Those two components are deprecated from now on.
-See `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for details on how to
+See :ref:`migration guide <migration-to-rasa-1.8>` for details on how to
 switch to the new component.
 
 ``TEDPolicy`` is the new name for ``EmbeddingPolicy``. ``EmbeddingPolicy`` is deprecated from now on.
diff --git a/changelog/5266.improvement.rst b/changelog/5266.improvement.rst
index 1d0a19321893..bff559283a77 100644
--- a/changelog/5266.improvement.rst
+++ b/changelog/5266.improvement.rst
@@ -1 +1 @@
-We updated our code to Tensorflow 2.0.
+We updated our code to TensorFlow 2.
diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst
index 19da19d66e03..0fc0fd323290 100644
--- a/changelog/5266.misc.rst
+++ b/changelog/5266.misc.rst
@@ -1,7 +1,7 @@
 We deprecated all existing pipeline templates, ``SklearnIntentClassifier`` and ``KerasPolicy``.
 
 Please list the components you want to use directly in your configuration file.
-Check out `Choosing a Pipeline <https://rasa.com/docs/rasa/nlu/choosing-a-pipeline/>`_ to decide what components to
+Check out :ref:`Choosing a Pipeline <choosing-a-pipeline>` to decide what components to
 include in your pipeline.
 
 Use ``DIETClassifier`` instead of ``SklearnIntentClassifier``.
diff --git a/changelog/663.feature.rst b/changelog/663.feature.rst
index 064d2e25398f..9d218cf1280e 100644
--- a/changelog/663.feature.rst
+++ b/changelog/663.feature.rst
@@ -3,3 +3,4 @@ The sentence vector of the ``SpacyFeaturizer`` and ``MitieFeaturizer`` can be ca
 To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` or the ``MitieFeaturizer``
 in your configuration file. The default pooling operation is ``mean``. The mean pooling operation also does not take
 into account words, that do not have a word vector.
+See our :ref:`documentation <components>` for more details.
diff --git a/changelog/699.misc.rst b/changelog/699.misc.rst
index 2fe612da3afd..e834f0d25d54 100644
--- a/changelog/699.misc.rst
+++ b/changelog/699.misc.rst
@@ -1,6 +1,3 @@
-The `TEDPolicy <https://rasa.com/docs/rasa/core/policies/#ted-policy>`_
-replaces the ``KerasPolicy`` in new Rasa projects generated with ``rasa init``.
-The `TEDPolicy <https://rasa.com/docs/rasa/core/policies/#ted-policy>`_
-is now the recommended machine learning policy. Please see the
-`migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_
-if you want to switch to this new policy in an existing project.
+The :ref:`TEDPolicy <ted_policy>` replaces the ``KerasPolicy`` in new Rasa projects generated with ``rasa init``.
+The :ref:`TEDPolicy <ted_policy>` is now the recommended machine learning policy. Please see the
+:ref:`migration guide <migration-to-rasa-1.8>` if you want to switch to this new policy in an existing project.

From ccf378921bfbc6330eca6dff92b88bd2c95c4000 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 13:41:55 +0100
Subject: [PATCH 483/633] remove diet selector

---
 data/configs_for_docs/default_config.yml      |   2 +-
 .../default_english_config.yml                |   2 +-
 .../config_pretrained_embeddings_convert.yml  |   2 +-
 .../config_pretrained_embeddings_spacy_de.yml |   2 +-
 .../config_pretrained_embeddings_spacy_en.yml |   2 +-
 .../config_supervised_embeddings_duckling.yml |   2 +-
 docs/migration-guide.rst                      |  18 -
 docs/nlu/components.rst                       | 121 +----
 rasa/cli/initial_project/config.yml           |   2 +-
 rasa/nlu/registry.py                          |   2 -
 rasa/nlu/selectors/diet_selector.py           | 427 ------------------
 rasa/nlu/selectors/response_selector.py       | 213 ++++++++-
 tests/nlu/selectors/test_selectors.py         |   2 +-
 tests/nlu/test_evaluation.py                  |   2 +-
 tests/nlu/test_train.py                       |   6 -
 tests/utilities.py                            |   2 -
 16 files changed, 207 insertions(+), 600 deletions(-)
 delete mode 100644 rasa/nlu/selectors/diet_selector.py

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index 0f3cdbf5e593..46b75c8078c7 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -11,4 +11,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: ResponseSelector
\ No newline at end of file
diff --git a/data/configs_for_docs/default_english_config.yml b/data/configs_for_docs/default_english_config.yml
index 1f264cc796ed..366e2bc9aac9 100644
--- a/data/configs_for_docs/default_english_config.yml
+++ b/data/configs_for_docs/default_english_config.yml
@@ -12,4 +12,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: ResponseSelector
\ No newline at end of file
diff --git a/docker/configs/config_pretrained_embeddings_convert.yml b/docker/configs/config_pretrained_embeddings_convert.yml
index c059db91d4db..ee0da9bfab1d 100644
--- a/docker/configs/config_pretrained_embeddings_convert.yml
+++ b/docker/configs/config_pretrained_embeddings_convert.yml
@@ -12,4 +12,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml
index 5a5c1b64d985..c5068fe6377e 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_de.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_pretrained_embeddings_spacy_en.yml b/docker/configs/config_pretrained_embeddings_spacy_en.yml
index 697a4d9eae09..b6591e42bc97 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_en.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_en.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_supervised_embeddings_duckling.yml b/docker/configs/config_supervised_embeddings_duckling.yml
index 0637f14f706b..7dbecba7acb9 100644
--- a/docker/configs/config_supervised_embeddings_duckling.yml
+++ b/docker/configs/config_supervised_embeddings_duckling.yml
@@ -11,6 +11,6 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
   - name: DucklingHTTPExtractor
     url: "http://duckling:8000"
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index c66a07ef272f..d9bf13cd549b 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -131,24 +131,6 @@ General
   ``DIETClassifier``. For more information about the ``DIETClassifier`` and the ``LexicalSyntacticFeaturizer``
   see :ref:`components`.
 
-- ``ResponseSelector`` is now deprecated and will be replaced by ``DIETSelector`` in the future. If you want to
-  get the same model behaviour as the current ``ResponseSelector``, you can use the following configuration of
-  ``DIETSelector``:
-
-  .. code-block:: yaml
-
-    pipeline:
-    # - ... other components
-    - name: DIETSelector
-      intent_classification: True
-      entity_recognition: False
-      use_masked_language_model: False
-      BILOU_flag: False
-      number_of_transformer_layers: 0
-      # ... any other parameters
-
-  See :ref:`diet-selector` for more information about the new component.
-
 .. _migration-to-rasa-1.7:
 
 Rasa 1.6 to Rasa 1.7
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c325eb052e5f..a4cc2059dc4b 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -967,121 +967,6 @@ ResponseSelector
     Response Selector component can be used to build a response retrieval model to directly predict a bot response from
     a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
     It embeds user inputs and response labels into the same space and follows the exact same
-    neural network architecture and optimization as the ``EmbeddingIntentClassifier``.
-
-    .. note:: If during prediction time a message contains **only** words unseen during training,
-              and no Out-Of-Vacabulary preprocessor was used,
-              empty response ``None`` is predicted with confidence ``0.0``.
-
-    .. warning::
-        ``ResponseSelector`` is deprecated and should be replaced by ``DIETSelector``. See
-        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
-
-:Configuration:
-
-    The algorithm includes all the hyperparameters that ``EmbeddingIntentClassifier`` uses.
-    In addition, the component can also be configured to train a response selector for a particular retrieval intent.
-
-        - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
-
-    Default values:
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "ResponseSelector"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
-            # Whether to share the hidden layer weights between input words and labels
-            "share_hidden_layers": False
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
-            "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
-            "epochs": 300
-            # set random seed to any int to get reproducible results
-            "random_seed": None
-            # optimizer
-            "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 512}
-            # dimension size of embedding vectors
-            "embedding_dimension": 20
-            # the type of the similarity
-            "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
-            "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
-            "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
-            "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
-            "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
-            "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # if true apply dropout to sparse tensors
-            "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # selector config
-            # name of the intent for which this response selector is to be trained
-            "retrieval_intent": None
-
-
-.. _diet-selector:
-
-DIETSelector
-~~~~~~~~~~~~~~~~
-
-:Short: DIET Selector
-:Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and response
-
-:Output-Example:
-
-    .. code-block:: json
-
-        {
-            "response_selector": {
-              "faq": {
-                "response": {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
-                "ranking": [
-                    {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
-                    {"confidence": 0.2134543431, "name": "You can ask me about how to get started"}
-                ]
-              }
-            }
-        }
-
-:Description:
-
-    DIET Selector component can be used to build a response retrieval model to directly predict a bot response from
-    a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
-    It embeds user inputs and response labels into the same space and follows the exact same
     neural network architecture and optimization as the ``DIETClassifier``.
 
     .. note:: If during prediction time a message contains **only** words unseen during training,
@@ -1105,13 +990,13 @@ DIETSelector
             # sizes of hidden layers before the embedding layer
             # for input words and intent labels,
             # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
+            "hidden_layers_sizes": {"text": [256, 128], "label": [256, 128]}
             # Whether to share the hidden layer weights between input words and labels
             "share_hidden_layers": False
             # number of units in transformer
-            "transformer_size": 256
+            "transformer_size": None
             # number of transformer layers
-            "number_of_transformer_layers": 2
+            "number_of_transformer_layers": 0
             # number of attention heads in transformer
             "number_of_attention_heads": 4
             # max sequence length
diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml
index f1088d9a442a..dedb0b714890 100644
--- a/rasa/cli/initial_project/config.yml
+++ b/rasa/cli/initial_project/config.yml
@@ -12,7 +12,7 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
 
 # Configuration for Rasa Core.
 # https://rasa.com/docs/rasa/core/policies/
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index f9b3a2217f4d..f00cd71e823f 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -33,7 +33,6 @@
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.response_selector import ResponseSelector
-from rasa.nlu.selectors.diet_selector import DIETSelector
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
@@ -93,7 +92,6 @@
     EmbeddingIntentClassifier,
     # selectors
     ResponseSelector,
-    DIETSelector,
 ]
 
 # Mapping from a components name to its class to allow name based lookup.
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
deleted file mode 100644
index a6e691a7c8cb..000000000000
--- a/rasa/nlu/selectors/diet_selector.py
+++ /dev/null
@@ -1,427 +0,0 @@
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from typing import Any, Dict, Optional, Text, Tuple, Union
-
-from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
-from rasa.nlu.components import any_of
-from rasa.utils.tensorflow.constants import (
-    LABEL,
-    HIDDEN_LAYERS_SIZES,
-    SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
-    NUM_TRANSFORMER_LAYERS,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
-    BATCH_SIZES,
-    BATCH_STRATEGY,
-    EPOCHS,
-    RANDOM_SEED,
-    LEARNING_RATE,
-    DENSE_DIM,
-    RANKING_LENGTH,
-    LOSS_TYPE,
-    SIMILARITY_TYPE,
-    NUM_NEG,
-    SPARSE_INPUT_DROPOUT,
-    MASKED_LM,
-    ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
-    EVAL_NUM_EXAMPLES,
-    EVAL_NUM_EPOCHS,
-    UNIDIRECTIONAL_ENCODER,
-    DROPRATE,
-    DROPRATE_ATTENTION,
-    NEG_MARGIN_SCALE,
-    REGULARIZATION_CONSTANT,
-    SCALE_LOSS,
-    EMBED_DIM,
-    BILOU_FLAG,
-    KEY_RELATIVE_ATTENTION,
-    VALUE_RELATIVE_ATTENTION,
-    MAX_RELATIVE_POSITION,
-    USE_MAX_NEG_SIM,
-    MAX_NEG_SIM,
-    MAX_POS_SIM,
-)
-from rasa.nlu.constants import (
-    RESPONSE,
-    RESPONSE_SELECTOR_PROPERTY_NAME,
-    DEFAULT_OPEN_UTTERANCE_TYPE,
-    DENSE_FEATURE_NAMES,
-    TEXT,
-    SPARSE_FEATURE_NAMES,
-)
-from rasa.utils.tensorflow.model_data import RasaModelData
-from rasa.utils.tensorflow.models import RasaModel
-
-
-logger = logging.getLogger(__name__)
-
-
-class DIETSelector(DIETClassifier):
-    """Response selector using supervised embeddings.
-
-    The response selector embeds user inputs
-    and candidate response into the same space.
-    Supervised embeddings are trained by maximizing similarity between them.
-    It also provides rankings of the response that did not "win".
-
-    The supervised response selector needs to be preceded by
-    a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that
-    can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``.
-
-    Based on the starspace idea from: https://arxiv.org/abs/1709.03856.
-    However, in this implementation the `mu` parameter is treated differently
-    and additional hidden layers are added together with dropout.
-    """
-
-    provides = [RESPONSE, "response_ranking"]
-
-    requires = [
-        any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
-        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
-    ]
-
-    # default properties (DOC MARKER - don't remove)
-    defaults = {
-        # nn architecture
-        # sizes of hidden layers before the embedding layer
-        # for input words and responses
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent labels
-        SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
-        TRANSFORMER_SIZE: None,
-        # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 0,
-        # number of attention heads in transformer
-        NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
-        BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
-        # number of epochs
-        EPOCHS: 300,
-        # set random seed to any int to get reproducible results
-        RANDOM_SEED: None,
-        # optimizer
-        LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 512},
-        # dimension size of embedding vectors
-        EMBED_DIM: 20,
-        # the type of the similarity
-        NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect actions
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
-        # number of top responses to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
-        RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct intent labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect intent labels
-        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # flag: if true, only minimize the maximum similarity for
-        # incorrect intent labels
-        USE_MAX_NEG_SIM: True,
-        # scale loss inverse proportionally to confidence of correct prediction
-        SCALE_LOSS: True,
-        # regularization parameters
-        # the scale of L2 regularization
-        REGULARIZATION_CONSTANT: 0.002,
-        # the scale of how critical the algorithm should be of minimizing the
-        # maximum similarity between embeddings of different intent labels
-        NEG_MARGIN_SCALE: 0.8,
-        # dropout rate for rnn
-        DROPRATE: 0.2,
-        # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
-        # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: False,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
-        # if true random tokens of the input message will be masked and the model
-        # should predict those tokens
-        MASKED_LM: False,
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
-        # selector config
-        # name of the intent for which this response selector is to be trained
-        "retrieval_intent": None,
-    }
-    # end default properties (DOC MARKER - don't remove)
-
-    def __init__(
-        self,
-        component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
-    ) -> None:
-
-        component_config = component_config or {}
-
-        # the following properties cannot be adapted for the ResponseSelector
-        component_config[INTENT_CLASSIFICATION] = True
-        component_config[ENTITY_RECOGNITION] = False
-        component_config[BILOU_FLAG] = False
-
-        super().__init__(
-            component_config,
-            inverted_label_dict,
-            inverted_tag_dict,
-            model,
-            batch_tuple_sizes,
-        )
-
-    @property
-    def label_key(self) -> Text:
-        return "label_ids"
-
-    @staticmethod
-    def model_class():
-        return DIET2DIET
-
-    def _load_selector_params(self, config: Dict[Text, Any]) -> None:
-        self.retrieval_intent = config["retrieval_intent"]
-        if not self.retrieval_intent:
-            # retrieval intent was left to its default value
-            logger.info(
-                "Retrieval intent parameter was left to its default value. This "
-                "response selector will be trained on training examples combining "
-                "all retrieval intents."
-            )
-
-    def _check_config_parameters(self) -> None:
-        super()._check_config_parameters()
-        self._load_selector_params(self.component_config)
-
-    @staticmethod
-    def _set_message_property(
-        message: Message, prediction_dict: Dict[Text, Any], selector_key: Text
-    ) -> None:
-
-        message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {})
-        message_selector_properties[selector_key] = prediction_dict
-        message.set(
-            RESPONSE_SELECTOR_PROPERTY_NAME,
-            message_selector_properties,
-            add_to_output=True,
-        )
-
-    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
-        """Performs sanity checks on training data, extracts encodings for labels
-        and prepares data for training"""
-        if self.retrieval_intent:
-            training_data = training_data.filter_by_intent(self.retrieval_intent)
-
-        label_id_dict = self._create_label_id_dict(training_data, attribute=RESPONSE)
-        self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
-
-        self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=RESPONSE
-        )
-
-        model_data = self._create_model_data(
-            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
-        )
-
-        self.check_input_dimension_consistency(model_data)
-
-        return model_data
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-        """Return the most likely response and its similarity to the input."""
-
-        out = self._predict(message)
-        label, label_ranking = self._predict_label(out)
-
-        selector_key = (
-            self.retrieval_intent
-            if self.retrieval_intent
-            else DEFAULT_OPEN_UTTERANCE_TYPE
-        )
-
-        logger.debug(
-            f"Adding following selector key to message property: {selector_key}"
-        )
-
-        prediction_dict = {"response": label, "ranking": label_ranking}
-
-        self._set_message_property(message, prediction_dict, selector_key)
-
-
-class DIET2DIET(DIET):
-    def _check_data(self) -> None:
-        if "text_features" not in self.data_signature:
-            raise ValueError(
-                f"No text features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if "label_features" not in self.data_signature:
-            raise ValueError(
-                f"No label features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if (
-            self.config[SHARE_HIDDEN_LAYERS]
-            and self.data_signature["text_features"]
-            != self.data_signature["label_features"]
-        ):
-            raise ValueError(
-                "If hidden layer weights are shared, data signatures "
-                "for text_features and label_features must coincide."
-            )
-
-    def _create_metrics(self) -> None:
-        # self.metrics preserve order
-        # output losses first
-        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
-        # output accuracies second
-        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
-        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
-
-    def _update_metrics_to_log(self) -> None:
-        if self.config[MASKED_LM]:
-            self.metrics_to_log += ["m_loss", "m_acc"]
-
-        self.metrics_to_log += ["r_loss", "r_acc"]
-
-    def _prepare_layers(self) -> None:
-        self.text_name = TEXT
-        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
-
-        self._prepare_sequence_layers(self.text_name)
-        self._prepare_sequence_layers(self.label_name)
-        if self.config[MASKED_LM]:
-            self._prepare_mask_lm_layers(self.text_name)
-        self._prepare_label_classification_layers()
-
-    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label_ids = self.tf_label_data["label_ids"][0]
-
-        mask_label = self.tf_label_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data["label_features"], mask_label, self.label_name
-        )
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-
-        all_labels_embed = self._tf_layers["embed.label"](cls_label)
-
-        return all_label_ids, all_labels_embed
-
-    def batch_loss(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> tf.Tensor:
-        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        (
-            text_transformed,
-            text_in,
-            text_seq_ids,
-            lm_mask_bool_text,
-        ) = self._create_sequence(
-            tf_batch_data["text_features"],
-            mask_text,
-            self.text_name,
-            self.config[MASKED_LM],
-            sequence_ids=True,
-        )
-
-        mask_label = tf_batch_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["label_features"], mask_label, self.label_name
-        )
-
-        losses = []
-
-        if self.config[MASKED_LM]:
-            loss, acc = self._mask_loss(
-                text_transformed,
-                text_in,
-                text_seq_ids,
-                lm_mask_bool_text,
-                self.text_name,
-            )
-
-            self.mask_loss.update_state(loss)
-            self.mask_acc.update_state(acc)
-            losses.append(loss)
-
-        # get _cls_ vector for label classification
-        cls_text = self._last_token(text_transformed, sequence_lengths_text)
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-        label_ids = tf_batch_data["label_ids"][0]
-
-        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
-        self.response_loss.update_state(loss)
-        self.response_acc.update_state(acc)
-        losses.append(loss)
-
-        return tf.math.add_n(losses)
-
-    def batch_predict(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = self.batch_to_model_data_format(
-            batch_in, self.predict_data_signature
-        )
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, self.text_name
-        )
-
-        out = {}
-
-        if self.all_labels_embed is None:
-            _, self.all_labels_embed = self._create_all_labels()
-
-        # get _cls_ vector for intent classification
-        cls = self._last_token(text_transformed, sequence_lengths_text)
-        cls_embed = self._tf_layers["embed.text"](cls)
-
-        sim_all = self._tf_layers["loss.label"].sim(
-            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
-        )
-        scores = self._tf_layers["loss.label"].confidence_from_sim(
-            sim_all, self.config[SIMILARITY_TYPE]
-        )
-        out["i_scores"] = scores
-
-        return out
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 5df777521d79..3d9c99bace1b 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -1,15 +1,21 @@
 import logging
 
-from typing import Any, Dict, Optional, Text
+import numpy as np
+import tensorflow as tf
+
+from typing import Any, Dict, Optional, Text, Tuple, Union
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -26,15 +32,20 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
     DROPRATE,
+    DROPRATE_ATTENTION,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
+    EMBED_DIM,
+    BILOU_FLAG,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
-    BILOU_FLAG,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -46,8 +57,7 @@
 )
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
-from rasa.utils.common import raise_warning
-from rasa.constants import DOCS_URL_COMPONENTS
+
 
 logger = logging.getLogger(__name__)
 
@@ -78,16 +88,23 @@ class ResponseSelector(DIETClassifier):
         any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
-    # please make sure to update the docs when changing a default parameter
+    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
         # for input words and responses
         # the number of hidden layers is thus equal to the length of this list
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent
-        # labels
+        # Whether to share the hidden layer weights between input words and intent labels
         SHARE_HIDDEN_LAYERS: False,
+        # number of units in transformer
+        TRANSFORMER_SIZE: None,
+        # number of transformer layers
+        NUM_TRANSFORMER_LAYERS: 0,
+        # number of attention heads in transformer
+        NUM_HEADS: 4,
+        # max sequence length if pos_encoding='emb'
+        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -132,6 +149,10 @@ class ResponseSelector(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
@@ -139,10 +160,20 @@ class ResponseSelector(DIETClassifier):
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
+        MASKED_LM: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
     }
+    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
@@ -159,8 +190,6 @@ def __init__(
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[BILOU_FLAG] = False
-        component_config[MASKED_LM] = False
-        component_config[NUM_TRANSFORMER_LAYERS] = 0
 
         super().__init__(
             component_config,
@@ -170,17 +199,14 @@ def __init__(
             batch_tuple_sizes,
         )
 
-        raise_warning(
-            f"'ResponseSelector' is deprecated and will be removed in version 2.0. "
-            f"Use 'DIETSelector' instead.",
-            category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
-        )
-
     @property
     def label_key(self) -> Text:
         return "label_ids"
 
+    @staticmethod
+    def model_class():
+        return DIET2DIET
+
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
@@ -248,3 +274,154 @@ def process(self, message: Message, **kwargs: Any) -> None:
         prediction_dict = {"response": label, "ranking": label_ranking}
 
         self._set_message_property(message, prediction_dict, selector_key)
+
+
+class DIET2DIET(DIET):
+    def _check_data(self) -> None:
+        if "text_features" not in self.data_signature:
+            raise ValueError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if "label_features" not in self.data_signature:
+            raise ValueError(
+                f"No label features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if (
+            self.config[SHARE_HIDDEN_LAYERS]
+            and self.data_signature["text_features"]
+            != self.data_signature["label_features"]
+        ):
+            raise ValueError(
+                "If hidden layer weights are shared, data signatures "
+                "for text_features and label_features must coincide."
+            )
+
+    def _create_metrics(self) -> None:
+        # self.metrics preserve order
+        # output losses first
+        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
+        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
+        # output accuracies second
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
+
+    def _update_metrics_to_log(self) -> None:
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_loss", "m_acc"]
+
+        self.metrics_to_log += ["r_loss", "r_acc"]
+
+    def _prepare_layers(self) -> None:
+        self.text_name = TEXT
+        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
+
+        self._prepare_sequence_layers(self.text_name)
+        self._prepare_sequence_layers(self.label_name)
+        if self.config[MASKED_LM]:
+            self._prepare_mask_lm_layers(self.text_name)
+        self._prepare_label_classification_layers()
+
+    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_label_ids = self.tf_label_data["label_ids"][0]
+
+        mask_label = self.tf_label_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            self.tf_label_data["label_features"], mask_label, self.label_name
+        )
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+
+        all_labels_embed = self._tf_layers["embed.label"](cls_label)
+
+        return all_label_ids, all_labels_embed
+
+    def batch_loss(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> tf.Tensor:
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        (
+            text_transformed,
+            text_in,
+            text_seq_ids,
+            lm_mask_bool_text,
+        ) = self._create_sequence(
+            tf_batch_data["text_features"],
+            mask_text,
+            self.text_name,
+            self.config[MASKED_LM],
+            sequence_ids=True,
+        )
+
+        mask_label = tf_batch_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["label_features"], mask_label, self.label_name
+        )
+
+        losses = []
+
+        if self.config[MASKED_LM]:
+            loss, acc = self._mask_loss(
+                text_transformed,
+                text_in,
+                text_seq_ids,
+                lm_mask_bool_text,
+                self.text_name,
+            )
+
+            self.mask_loss.update_state(loss)
+            self.mask_acc.update_state(acc)
+            losses.append(loss)
+
+        # get _cls_ vector for label classification
+        cls_text = self._last_token(text_transformed, sequence_lengths_text)
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+        label_ids = tf_batch_data["label_ids"][0]
+
+        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
+        self.response_loss.update_state(loss)
+        self.response_acc.update_state(acc)
+        losses.append(loss)
+
+        return tf.math.add_n(losses)
+
+    def batch_predict(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
+        tf_batch_data = self.batch_to_model_data_format(
+            batch_in, self.predict_data_signature
+        )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        text_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, self.text_name
+        )
+
+        out = {}
+
+        if self.all_labels_embed is None:
+            _, self.all_labels_embed = self._create_all_labels()
+
+        # get _cls_ vector for intent classification
+        cls = self._last_token(text_transformed, sequence_lengths_text)
+        cls_embed = self._tf_layers["embed.text"](cls)
+
+        sim_all = self._tf_layers["loss.label"].sim(
+            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
+        )
+        scores = self._tf_layers["loss.label"].confidence_from_sim(
+            sim_all, self.config[SIMILARITY_TYPE]
+        )
+        out["i_scores"] = scores
+
+        return out
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 02fd54764ba3..333ac185aaf7 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -17,7 +17,7 @@
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETSelector", EPOCHS: 2},
+            {"name": "ResponseSelector", EPOCHS: 2},
         ],
     ],
 )
diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index f8ac3176d115..03e900d8d48d 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -317,7 +317,7 @@ def test_run_cv_evaluation_with_response_selector():
                 {"name": "WhitespaceTokenizer"},
                 {"name": "CountVectorsFeaturizer"},
                 {"name": "DIETClassifier", EPOCHS: 2},
-                {"name": "DIETSelector", EPOCHS: 2},
+                {"name": "ResponseSelector", EPOCHS: 2},
             ],
         }
     )
diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 9037d0c7971d..bdd0d196ecba 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -41,7 +41,6 @@ def pipelines_for_tests():
                 "DIETClassifier",
                 "EmbeddingIntentClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -60,7 +59,6 @@ def pipelines_for_tests():
                 "SklearnIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -77,7 +75,6 @@ def pipelines_for_tests():
                 "DucklingHTTPExtractor",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -93,7 +90,6 @@ def pipelines_for_tests():
                 "DucklingHTTPExtractor",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -110,7 +106,6 @@ def pipelines_for_tests():
                 "MitieIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -126,7 +121,6 @@ def pipelines_for_tests():
                 "MitieIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
diff --git a/tests/utilities.py b/tests/utilities.py
index f776052ac9be..70637933b192 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -4,7 +4,6 @@
 import rasa.utils.io as io_utils
 from nlu.classifiers.diet_classifier import DIETClassifier
 from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
-from nlu.selectors.diet_selector import DIETSelector
 from nlu.selectors.response_selector import ResponseSelector
 from utils.tensorflow.constants import EPOCHS
 
@@ -32,7 +31,6 @@ def update_number_of_epochs(config_path: Text, output_file: Text):
             EmbeddingIntentClassifier.name,
             DIETClassifier.name,
             ResponseSelector.name,
-            DIETSelector.name,
         ]:
             component[EPOCHS] = 2
 

From 2444e01951b7d21ca2f25c9330deac60d6db889e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 13:51:33 +0100
Subject: [PATCH 484/633] add changelog for removed mitie docker image

---
 changelog/5266.removal.rst | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 changelog/5266.removal.rst

diff --git a/changelog/5266.removal.rst b/changelog/5266.removal.rst
new file mode 100644
index 000000000000..b95e2242d3af
--- /dev/null
+++ b/changelog/5266.removal.rst
@@ -0,0 +1,21 @@
+We replaced the MITIE Docker image with a Docker image that uses ConveRT.
+
+The new images uses the following configuration
+
+```
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
+```
\ No newline at end of file

From d2426f9f7f004d26eb7686ddbaa67e2322502b66 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 13:54:08 +0100
Subject: [PATCH 485/633] add weight sparsity to the docs

---
 docs/nlu/components.rst                           | 15 ++++++++++-----
 .../classifiers/embedding_intent_classifier.py    |  3 +++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c325eb052e5f..09f21ea9e4b3 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -819,6 +819,7 @@ EmbeddingIntentClassifier
               the maximum similarity between embeddings of different intent labels.
             - ``droprate`` sets the dropout rate, it should be
               between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
+            - ``weight_sparsity`` sets the sparsity of the weght kernels in dense layers.
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
     .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
@@ -890,6 +891,8 @@ EmbeddingIntentClassifier
             "negative_margin_scale": 0.8
             # dropout rate for rnn
             "droprate": 0.2
+            # sparsity of the weights in dense layers
+            "weight_sparsity": 0.8
             # if true apply dropout to sparse tensors
             "use_sparse_input_dropout": False
             # visualization of accuracy
@@ -1040,6 +1043,8 @@ ResponseSelector
             "negative_margin_scale": 0.8
             # dropout rate for rnn
             "droprate": 0.2
+            # sparsity of the weights in dense layers
+            "weight_sparsity": 0.8
             # if true apply dropout to sparse tensors
             "use_sparse_input_dropout": True
             # visualization of accuracy
@@ -1114,8 +1119,6 @@ DIETSelector
             "number_of_transformer_layers": 2
             # number of attention heads in transformer
             "number_of_attention_heads": 4
-            # max sequence length
-            "maximum_sequence_length": 256
             # use a unidirectional or bidirectional encoder
             "unidirectional_encoder": False
             # if true use key relative embeddings in attention
@@ -1169,6 +1172,8 @@ DIETSelector
             "droprate": 0.2
             # dropout rate for attention
             "droprate_attention": 0
+            # sparsity of the weights in dense layers
+            "weight_sparsity": 0.8
             # if true apply dropout to sparse tensors
             "use_sparse_input_dropout": True
             # visualization of accuracy
@@ -1551,7 +1556,6 @@ DIETClassifier
             - ``transformer_size`` sets the size of the transformer.
             - ``number_of_transformer_layers`` sets the number of transformer layers to use.
             - ``number_of_attention_heads`` sets the number of attention heads to use.
-            - ``maximum_sequence_length`` sets the maximum length of sequence.
             - ``unidirectional_encoder`` specifies whether to use a unidirectional or bidirectional encoder.
             - ``use_key_relative_attention`` if true use key relative embeddings in attention.
             - ``use_value_relative_attention`` if true use key relative embeddings in attention.
@@ -1611,6 +1615,7 @@ DIETClassifier
               between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
             - ``droprate_attention`` sets the dropout rate for attention, it should be
               between ``0`` and ``1``, e.g. ``droprate_attention=0.1`` would drop out ``10%`` of input units.
+            - ``weight_sparsity`` sets the sparsity of weight kernels in dense layers.
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
         - model configuration:
@@ -1652,8 +1657,6 @@ DIETClassifier
             "number_of_transformer_layers": 2
             # number of attention heads in transformer
             "number_of_attention_heads": 4
-            # max sequence length
-            "maximum_sequence_length": 256
             # use a unidirectional or bidirectional encoder
             "unidirectional_encoder": False
             # if true use key relative embeddings in attention
@@ -1707,6 +1710,8 @@ DIETClassifier
             "droprate": 0.2
             # dropout rate for attention
             "droprate_attention": 0
+            # sparsity of the weights in dense layers
+            "weight_sparsity": 0.8
             # if true apply dropout to sparse tensors
             "use_sparse_input_dropout": True
             # visualization of accuracy
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 90f9b5881c75..2f3148ed3e9b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -27,6 +27,7 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     DROPRATE,
+    WEIGHT_SPARSITY,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
@@ -100,6 +101,8 @@ class EmbeddingIntentClassifier(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
+        # sparsity of the weights in dense layers
+        WEIGHT_SPARSITY: 0.8,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy

From c4871b4ec5bbfa1072dd31d0e1f40e086f73fca5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 13:55:42 +0100
Subject: [PATCH 486/633] remove doc markers

---
 rasa/core/policies/embedding_policy.py  | 2 --
 rasa/core/policies/ted_policy.py        | 2 --
 rasa/nlu/classifiers/diet_classifier.py | 2 --
 rasa/nlu/selectors/response_selector.py | 2 --
 4 files changed, 8 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index a500abca6f48..2a12f2f0a454 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -48,7 +48,6 @@ class EmbeddingPolicy(TEDPolicy):
     The policy used in our paper https://arxiv.org/abs/1910.00486
     """
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # a list of hidden layers sizes before dialogue and action embed layers
@@ -118,7 +117,6 @@ class EmbeddingPolicy(TEDPolicy):
         # max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 9d65e00901e7..95754a41d4b7 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -69,7 +69,6 @@ class TEDPolicy(Policy):
 
     SUPPORTS_ONLINE_TRAINING = True
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # a list of hidden layers sizes before dialogue and action embed layers
@@ -139,7 +138,6 @@ class TEDPolicy(Policy):
         # how many examples to use for hold out validation set
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
     }
-    # end default properties (DOC MARKER - don't remove)
 
     @staticmethod
     def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer:
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 935c8356f7ca..472f57a00e74 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -101,7 +101,6 @@ class DIETClassifier(EntityExtractor):
 
     requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -191,7 +190,6 @@ class DIETClassifier(EntityExtractor):
         # rule of thumb: use only if more than 100 egs. per entity
         BILOU_FLAG: True,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     # init helpers
     def _check_config_parameters(self) -> None:
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 3d9c99bace1b..6214800d7cbf 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -88,7 +88,6 @@ class ResponseSelector(DIETClassifier):
         any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -173,7 +172,6 @@ class ResponseSelector(DIETClassifier):
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,

From 672ea5bafc50de67f359ad8bea3a742f6252323d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:00:42 +0100
Subject: [PATCH 487/633] review comments on docs

---
 data/configs_for_docs/default_config.yml      | 23 ++++++++++++++++++-
 .../configs_for_docs/default_spacy_config.yml | 14 +++++++++++
 docs/core/policies.rst                        |  5 +---
 docs/nlu/choosing-a-pipeline.rst              | 20 ++++++++--------
 4 files changed, 47 insertions(+), 15 deletions(-)
 create mode 100644 data/configs_for_docs/default_spacy_config.yml

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index 0f3cdbf5e593..ce7cdc028443 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -1,9 +1,30 @@
 language: "en"
 
 pipeline:
-  - name: WhitespaceTokenizer
+  - name: SpacyNLP
+  - name: SpacyTokenizer
+  - name: SpacyFeaturizer
   - name: RegexFeaturizer
   - name: LexicalSyntacticFeaturizer
+    "features": [
+      ["low", "title", "upper"],
+      [
+        "BOS",
+        "EOS",
+        "low",
+        "prefix5",
+        "prefix2",
+        "suffix5",
+        "suffix3",
+        "suffix2",
+        "upper",
+        "title",
+        "digit",
+        "pos",
+        "pos2"
+      ],
+      ["low", "title", "upper"],
+    ]
   - name: CountVectorsFeaturizer
   - name: CountVectorsFeaturizer
     analyzer: "char_wb"
diff --git a/data/configs_for_docs/default_spacy_config.yml b/data/configs_for_docs/default_spacy_config.yml
new file mode 100644
index 000000000000..0f3cdbf5e593
--- /dev/null
+++ b/data/configs_for_docs/default_spacy_config.yml
@@ -0,0 +1,14 @@
+language: "en"
+
+pipeline:
+  - name: WhitespaceTokenizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: DIETSelector
\ No newline at end of file
diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 9393b862b923..86cf142380ce 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -202,15 +202,12 @@ Embedding Policy
         ``EmbeddingPolicy`` got renamed to ``TEDPolicy``. Please use :ref:`ted_policy` instead of ``EmbeddingPolicy``.
         The functionality of the policy stayed the same.
 
-
 .. _ted_policy:
 
 TED Policy
 ^^^^^^^^^^
 
-Transformer Embedding Dialogue (TED) Policy
-
-The policy used in our paper https://arxiv.org/abs/1910.00486.
+Transformer Embedding Dialogue (TED) Policy is described in our paper https://arxiv.org/abs/1910.00486.
 
 This policy has a pre-defined architecture, which comprises the
 following steps:
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 174b3beb018d..e3b11d756e77 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -39,7 +39,7 @@ A Longer Answer
 We encourage everyone to define their own pipeline by listing the names of the components you want to use.
 For example:
 
-.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+.. literalinclude:: ../../data/configs_for_docs/default_spacy_config.yml
     :language: yaml
 
 You can find the details of each component in :ref:`components`.
@@ -55,7 +55,8 @@ Tokenization
 ~~~~~~~~~~~~
 If your chosen language is whitespace-tokenized (words are separated by spaces), you
 can use the ``WhitespaceTokenizer``. If this is not the case you should use a different tokenizer.
-We support a number of different :ref:`tokenizers <tokenizers>`, or you can :ref:`create your own <custom-nlu-components>`.
+We support a number of different :ref:`tokenizers <tokenizers>`, or you can
+:ref:`create your own <custom-nlu-components>`.
 
 .. note::
     Some components further down the pipeline may require a specific tokenizer. You can find those requirements
@@ -90,25 +91,24 @@ have a training example, like: "can I book a car?", and Rasa is asked to predict
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
 for both is highly likely to be the same. This is also useful if you don't have large enough training data.
 
-An alternative to ``ConveRTFeaturizer`` can be ``LanguageModelFeaturizer`` which uses pre-trained language models such as
-BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See :ref:`HFTransformersNLP`
-for a full list of supported language models.
+An alternative to ``ConveRTFeaturizer`` can be ``LanguageModelFeaturizer`` which uses pre-trained language models such
+as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See
+:ref:`HFTransformersNLP` for a full list of supported language models.
 
 In case, your training data is not in English you can also use a different variant of a language model which
 is pre-trained in the language specific to your training data. For example, there is a chinese language variant of
-BERT(``bert-base-chinese``) or a japanese variant of it(``bert-base-japanese``). A full list of different variants of these
-language models is available in the
-`official docs of Transformers library <https://huggingface.co/transformers/pretrained_models.html>_`
+BERT(``bert-base-chinese``) or a japanese variant of it(``bert-base-japanese``). A full list of different variants of
+these language models is available in the
+`official docs of Transformers library <https://huggingface.co/transformers/pretrained_models.html>_`.
 
 ``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`).
 So, this featurizer can also be an alternate option depending on the language of your training data.
 
-
 Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Depending on your data you may want to only perform intent classification or entity recognition.
 We support several components for each of the task. All of them are listed in :ref:`components`.
-We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`diet-selector`
+We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`response-selector`
 for response selection.
 
 Comparing different pipelines for your data

From f67b634de39ecab2dc7241cfa01dc428fd747774 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 14:07:33 +0100
Subject: [PATCH 488/633] made transformers lib optional and removed a few
 other deps

---
 alt_requirements/requirements_full.txt        |  3 +++
 ...nts_pretrained_embeddings_transformers.txt |  4 +++
 docs/nlu/components.rst                       |  3 +++
 .../nlu/utils/hugging_face/hf_transformers.py | 25 +++++++++++++------
 requirements.txt                              |  4 ---
 setup.py                                      |  3 ++-
 6 files changed, 29 insertions(+), 13 deletions(-)
 create mode 100644 alt_requirements/requirements_pretrained_embeddings_transformers.txt

diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt
index e033e2544462..b97e60736d04 100644
--- a/alt_requirements/requirements_full.txt
+++ b/alt_requirements/requirements_full.txt
@@ -10,4 +10,7 @@
 # ConveRT Requirements
 -r requirements_pretrained_embeddings_convert.txt
 
+# Transformers Requirements
+-r requirements_pretrained_embeddings_transformers.txt
+
 jieba==0.39
diff --git a/alt_requirements/requirements_pretrained_embeddings_transformers.txt b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
new file mode 100644
index 000000000000..a513258cbe48
--- /dev/null
+++ b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
@@ -0,0 +1,4 @@
+# Minimum Install Requirements
+-r ../requirements.txt
+
+transformers==2.3.0
\ No newline at end of file
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c325eb052e5f..64069a6494d4 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -103,6 +103,9 @@ HFTransformersNLP
     featurization to compute sequence and sentence level representations for each example in the training data.
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
+
+    .. note:: To use ``HFTransformersNLP`` component, install Rasa OS with ``pip install rasa[transformers]``.
+
 :Configuration:
     .. code-block:: yaml
 
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 70a577ccfec0..554e4b0b1160 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -9,14 +9,6 @@
 import rasa.utils.train_utils as train_utils
 import numpy as np
 
-from rasa.nlu.utils.hugging_face.registry import (
-    model_class_dict,
-    model_tokenizer_dict,
-    model_weights_defaults,
-    model_special_tokens_pre_processors,
-    model_embeddings_post_processors,
-    model_tokens_cleaners,
-)
 from rasa.nlu.constants import (
     TEXT,
     LANGUAGE_MODEL_DOCS,
@@ -60,6 +52,12 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
     def _load_model(self) -> None:
         """Try loading the model"""
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_class_dict,
+            model_weights_defaults,
+            model_tokenizer_dict,
+        )
+
         self.model_name = self.component_config["model_name"]
 
         if self.model_name not in model_class_dict:
@@ -106,6 +104,10 @@ def _add_lm_specific_special_tokens(
         self, token_ids: List[List[int]]
     ) -> List[List[int]]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_special_tokens_pre_processors,
+        )
+
         augmented_tokens = [
             model_special_tokens_pre_processors[self.model_name](example_token_ids)
             for example_token_ids in token_ids
@@ -113,12 +115,19 @@ def _add_lm_specific_special_tokens(
         return augmented_tokens
 
     def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
+
+        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners
+
         return model_tokens_cleaners[self.model_name](token_strings)
 
     def _post_process_sequence_embeddings(
         self, sequence_embeddings: np.ndarray
     ) -> Tuple[np.ndarray, np.ndarray]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_embeddings_post_processors,
+        )
+
         sentence_embeddings = []
         post_processed_sequence_embeddings = []
 
diff --git a/requirements.txt b/requirements.txt
index 42947178522f..452957aea85f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,6 @@ absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
 # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
 setuptools >= 41.0.0
-tensor2tensor==1.14.0
 apscheduler==3.6.0
 tqdm==4.31.0
 networkx==2.4.0
@@ -62,7 +61,4 @@ PyJWT==1.7.1
 python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
-tensorflow_hub==0.7.0
 tensorflow-addons==0.7.0
-tensorflow-probability==0.7.0
-transformers==2.3.0
diff --git a/setup.py b/setup.py
index f4684ea9eb61..ba95e87f8fe9 100644
--- a/setup.py
+++ b/setup.py
@@ -85,13 +85,14 @@
     "SQLAlchemy~=1.3.0",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
-    "transformers~=2.3.0",
+    "tensorflow-addons==0.7.0",
 ]
 
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
     "convert": ["tensorflow_text~=2.1.0rc0", "tensorflow_hub~=0.7.0"],
+    "transformers": ["transformers~=2.3.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],

From 0223cf695f362524aabf104745064bd5ff23b9fa Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 14:07:33 +0100
Subject: [PATCH 489/633] made transformers lib optional and removed a few
 other deps

---
 alt_requirements/requirements_full.txt        |  3 +++
 ...nts_pretrained_embeddings_transformers.txt |  4 +++
 docs/nlu/components.rst                       |  3 +++
 .../nlu/utils/hugging_face/hf_transformers.py | 25 +++++++++++++------
 requirements.txt                              |  4 ---
 setup.py                                      |  3 ++-
 6 files changed, 29 insertions(+), 13 deletions(-)
 create mode 100644 alt_requirements/requirements_pretrained_embeddings_transformers.txt

diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt
index e033e2544462..b97e60736d04 100644
--- a/alt_requirements/requirements_full.txt
+++ b/alt_requirements/requirements_full.txt
@@ -10,4 +10,7 @@
 # ConveRT Requirements
 -r requirements_pretrained_embeddings_convert.txt
 
+# Transformers Requirements
+-r requirements_pretrained_embeddings_transformers.txt
+
 jieba==0.39
diff --git a/alt_requirements/requirements_pretrained_embeddings_transformers.txt b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
new file mode 100644
index 000000000000..a513258cbe48
--- /dev/null
+++ b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
@@ -0,0 +1,4 @@
+# Minimum Install Requirements
+-r ../requirements.txt
+
+transformers==2.3.0
\ No newline at end of file
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c325eb052e5f..64069a6494d4 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -103,6 +103,9 @@ HFTransformersNLP
     featurization to compute sequence and sentence level representations for each example in the training data.
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
+
+    .. note:: To use ``HFTransformersNLP`` component, install Rasa OS with ``pip install rasa[transformers]``.
+
 :Configuration:
     .. code-block:: yaml
 
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 70a577ccfec0..554e4b0b1160 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -9,14 +9,6 @@
 import rasa.utils.train_utils as train_utils
 import numpy as np
 
-from rasa.nlu.utils.hugging_face.registry import (
-    model_class_dict,
-    model_tokenizer_dict,
-    model_weights_defaults,
-    model_special_tokens_pre_processors,
-    model_embeddings_post_processors,
-    model_tokens_cleaners,
-)
 from rasa.nlu.constants import (
     TEXT,
     LANGUAGE_MODEL_DOCS,
@@ -60,6 +52,12 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
     def _load_model(self) -> None:
         """Try loading the model"""
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_class_dict,
+            model_weights_defaults,
+            model_tokenizer_dict,
+        )
+
         self.model_name = self.component_config["model_name"]
 
         if self.model_name not in model_class_dict:
@@ -106,6 +104,10 @@ def _add_lm_specific_special_tokens(
         self, token_ids: List[List[int]]
     ) -> List[List[int]]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_special_tokens_pre_processors,
+        )
+
         augmented_tokens = [
             model_special_tokens_pre_processors[self.model_name](example_token_ids)
             for example_token_ids in token_ids
@@ -113,12 +115,19 @@ def _add_lm_specific_special_tokens(
         return augmented_tokens
 
     def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
+
+        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners
+
         return model_tokens_cleaners[self.model_name](token_strings)
 
     def _post_process_sequence_embeddings(
         self, sequence_embeddings: np.ndarray
     ) -> Tuple[np.ndarray, np.ndarray]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_embeddings_post_processors,
+        )
+
         sentence_embeddings = []
         post_processed_sequence_embeddings = []
 
diff --git a/requirements.txt b/requirements.txt
index 42947178522f..452957aea85f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,6 @@ absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
 # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
 setuptools >= 41.0.0
-tensor2tensor==1.14.0
 apscheduler==3.6.0
 tqdm==4.31.0
 networkx==2.4.0
@@ -62,7 +61,4 @@ PyJWT==1.7.1
 python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
-tensorflow_hub==0.7.0
 tensorflow-addons==0.7.0
-tensorflow-probability==0.7.0
-transformers==2.3.0
diff --git a/setup.py b/setup.py
index f4684ea9eb61..ba95e87f8fe9 100644
--- a/setup.py
+++ b/setup.py
@@ -85,13 +85,14 @@
     "SQLAlchemy~=1.3.0",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
-    "transformers~=2.3.0",
+    "tensorflow-addons==0.7.0",
 ]
 
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
     "convert": ["tensorflow_text~=2.1.0rc0", "tensorflow_hub~=0.7.0"],
+    "transformers": ["transformers~=2.3.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],

From f1855b8a0694adb4561c4bcd0378845f916ea51c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:15:30 +0100
Subject: [PATCH 490/633] review comments on docs

---
 docs/core/policies.rst           | 9 +++++----
 docs/migration-guide.rst         | 2 +-
 docs/nlu/choosing-a-pipeline.rst | 2 +-
 docs/nlu/components.rst          | 2 ++
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 86cf142380ce..60cade541abf 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -207,12 +207,13 @@ Embedding Policy
 TED Policy
 ^^^^^^^^^^
 
-Transformer Embedding Dialogue (TED) Policy is described in our paper https://arxiv.org/abs/1910.00486.
+Transformer Embedding Dialogue (TED) Policy is described in
+`our paper <https://github.com/PolyAI-LDN/polyai-models#convert>`__.
 
 This policy has a pre-defined architecture, which comprises the
 following steps:
 
-    - concatenate user input (user intent and entities), previous system action, slots and active form for each time
+    - concatenate user input (user intent and entities), previous system actions, slots and active forms for each time
       step into an input vector to pre-transformer embedding layer;
     - feed it to transformer;
     - apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step;
@@ -350,7 +351,7 @@ It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(
 
     .. code-block:: yaml
 
-        # nn architecture
+        # Architecture of the used neural network.
         # a list of hidden layers sizes before dialogue and action embed layers
         # number of hidden layers is equal to the length of this list
         "hidden_layers_sizes": {"dialogue": [], "label": []}
@@ -420,7 +421,7 @@ It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(
 
     .. note::
 
-          Parameter ``maximum_negative_similarity`` is set to a negative value to mimic
+          The parameter ``maximum_negative_similarity`` is set to a negative value to mimic
           the original starspace algorithm in the case
           ``maximum_negative_similarity = maximum_positive_similarity`` and
           ``use_maximum_negative_similarity = False``. See
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index c66a07ef272f..0a429a92170e 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -72,7 +72,7 @@ General
   A warning will be logged in case an old option is used. However, you can still use the old configuration options.
   They will be mapped to the new names.
 
-- ``EmbeddingIntentClassifier`` is now deprecated and will be replaced by ``DIETClassifier`` in the future.
+- :ref:`embedding-intent-classifier` is now deprecated and will be replaced by :ref:`diet-classifier` in the future.
   ``DIETClassfier`` is based on a multi-task architecture for intent classification and entity recognition.
   However, if you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use
   the following configuration of ``DIETClassifier``:
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index e3b11d756e77..463437642b35 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -72,7 +72,7 @@ in general English, the word "balance" is closely related to "symmetry", but ver
 banking domain, "balance" and "cash" are closely related and you'd like your model to capture that. If you don't
 use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language and domain.
 Thus, you should only use featurizers from the category `sparse` featuirzers, such as
-``CountVectorsFeaturizer`` or ``RegexFeaturizer``.
+``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``.
 
 The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
 "I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model already knows that the
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index c325eb052e5f..17121356a6aa 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -713,6 +713,8 @@ SklearnIntentClassifier
           # This is used with the ``C`` hyperparameter in GridSearchCV.
           kernels: ["linear"]
 
+.. _embedding-intent-classifier:
+
 EmbeddingIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 

From a45468a3e75d3a5c0458cbe3baff21a8af488444 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 14:15:31 +0100
Subject: [PATCH 491/633] Update docs/nlu/components.rst

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 docs/nlu/components.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 64069a6494d4..434ea71c2db9 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -104,7 +104,7 @@ HFTransformersNLP
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
 
-    .. note:: To use ``HFTransformersNLP`` component, install Rasa OS with ``pip install rasa[transformers]``.
+     .. note:: To use ``HFTransformersNLP`` component, install Rasa Open Source with ``pip install rasa[transformers]``.
 
 :Configuration:
     .. code-block:: yaml
@@ -1728,4 +1728,4 @@ DIETClassifier
             # BILOU_flag determines whether to use BILOU tagging or not.
             # More rigorous however requires more examples per entity
             # rule of thumb: use only if more than 100 egs. per entity
-            "BILOU_flag": True
\ No newline at end of file
+            "BILOU_flag": True

From b82ff9b40f19f50f7038f8ea7648b81ee512ebb4 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 14:19:21 +0100
Subject: [PATCH 492/633] fix link in docs

---
 docs/nlu/choosing-a-pipeline.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 174b3beb018d..f541b79253b3 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -108,8 +108,8 @@ Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Depending on your data you may want to only perform intent classification or entity recognition.
 We support several components for each of the task. All of them are listed in :ref:`components`.
-We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`diet-selector`
-for response selection.
+We recommend to use :ref:`diet-classifier` for intent classification and entity recognition
+and :ref:`response-selector` for response selection.
 
 Comparing different pipelines for your data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 2e28cc25a01758b6b3e692fb4e2ff715f54d49f1 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 13:41:55 +0100
Subject: [PATCH 493/633] remove diet selector

---
 data/configs_for_docs/default_config.yml      |   2 +-
 .../default_english_config.yml                |   2 +-
 .../config_pretrained_embeddings_convert.yml  |   2 +-
 .../config_pretrained_embeddings_spacy_de.yml |   2 +-
 .../config_pretrained_embeddings_spacy_en.yml |   2 +-
 .../config_supervised_embeddings_duckling.yml |   2 +-
 docs/migration-guide.rst                      |  18 -
 docs/nlu/components.rst                       | 121 +----
 rasa/cli/initial_project/config.yml           |   2 +-
 rasa/nlu/registry.py                          |   2 -
 rasa/nlu/selectors/diet_selector.py           | 427 ------------------
 rasa/nlu/selectors/response_selector.py       | 213 ++++++++-
 tests/nlu/selectors/test_selectors.py         |   2 +-
 tests/nlu/test_evaluation.py                  |   2 +-
 tests/nlu/test_train.py                       |   6 -
 tests/utilities.py                            |   2 -
 16 files changed, 207 insertions(+), 600 deletions(-)
 delete mode 100644 rasa/nlu/selectors/diet_selector.py

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index ce7cdc028443..a9671b2a8c65 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -32,4 +32,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: ResponseSelector
\ No newline at end of file
diff --git a/data/configs_for_docs/default_english_config.yml b/data/configs_for_docs/default_english_config.yml
index 1f264cc796ed..366e2bc9aac9 100644
--- a/data/configs_for_docs/default_english_config.yml
+++ b/data/configs_for_docs/default_english_config.yml
@@ -12,4 +12,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: ResponseSelector
\ No newline at end of file
diff --git a/docker/configs/config_pretrained_embeddings_convert.yml b/docker/configs/config_pretrained_embeddings_convert.yml
index c059db91d4db..ee0da9bfab1d 100644
--- a/docker/configs/config_pretrained_embeddings_convert.yml
+++ b/docker/configs/config_pretrained_embeddings_convert.yml
@@ -12,4 +12,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml
index 5a5c1b64d985..c5068fe6377e 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_de.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_pretrained_embeddings_spacy_en.yml b/docker/configs/config_pretrained_embeddings_spacy_en.yml
index 697a4d9eae09..b6591e42bc97 100644
--- a/docker/configs/config_pretrained_embeddings_spacy_en.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_en.yml
@@ -13,4 +13,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
diff --git a/docker/configs/config_supervised_embeddings_duckling.yml b/docker/configs/config_supervised_embeddings_duckling.yml
index 0637f14f706b..7dbecba7acb9 100644
--- a/docker/configs/config_supervised_embeddings_duckling.yml
+++ b/docker/configs/config_supervised_embeddings_duckling.yml
@@ -11,6 +11,6 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
   - name: DucklingHTTPExtractor
     url: "http://duckling:8000"
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 0a429a92170e..6d86a9c01480 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -131,24 +131,6 @@ General
   ``DIETClassifier``. For more information about the ``DIETClassifier`` and the ``LexicalSyntacticFeaturizer``
   see :ref:`components`.
 
-- ``ResponseSelector`` is now deprecated and will be replaced by ``DIETSelector`` in the future. If you want to
-  get the same model behaviour as the current ``ResponseSelector``, you can use the following configuration of
-  ``DIETSelector``:
-
-  .. code-block:: yaml
-
-    pipeline:
-    # - ... other components
-    - name: DIETSelector
-      intent_classification: True
-      entity_recognition: False
-      use_masked_language_model: False
-      BILOU_flag: False
-      number_of_transformer_layers: 0
-      # ... any other parameters
-
-  See :ref:`diet-selector` for more information about the new component.
-
 .. _migration-to-rasa-1.7:
 
 Rasa 1.6 to Rasa 1.7
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 17121356a6aa..cff3977e6cd8 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -969,121 +969,6 @@ ResponseSelector
     Response Selector component can be used to build a response retrieval model to directly predict a bot response from
     a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
     It embeds user inputs and response labels into the same space and follows the exact same
-    neural network architecture and optimization as the ``EmbeddingIntentClassifier``.
-
-    .. note:: If during prediction time a message contains **only** words unseen during training,
-              and no Out-Of-Vacabulary preprocessor was used,
-              empty response ``None`` is predicted with confidence ``0.0``.
-
-    .. warning::
-        ``ResponseSelector`` is deprecated and should be replaced by ``DIETSelector``. See
-        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
-
-:Configuration:
-
-    The algorithm includes all the hyperparameters that ``EmbeddingIntentClassifier`` uses.
-    In addition, the component can also be configured to train a response selector for a particular retrieval intent.
-
-        - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
-
-    Default values:
-
-    .. code-block:: yaml
-
-        pipeline:
-        - name: "ResponseSelector"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
-            # Whether to share the hidden layer weights between input words and labels
-            "share_hidden_layers": False
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
-            "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
-            "epochs": 300
-            # set random seed to any int to get reproducible results
-            "random_seed": None
-            # optimizer
-            "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 512}
-            # dimension size of embedding vectors
-            "embedding_dimension": 20
-            # the type of the similarity
-            "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
-            "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
-            "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
-            "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
-            "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
-            "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # if true apply dropout to sparse tensors
-            "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # selector config
-            # name of the intent for which this response selector is to be trained
-            "retrieval_intent": None
-
-
-.. _diet-selector:
-
-DIETSelector
-~~~~~~~~~~~~~~~~
-
-:Short: DIET Selector
-:Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and response
-
-:Output-Example:
-
-    .. code-block:: json
-
-        {
-            "response_selector": {
-              "faq": {
-                "response": {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
-                "ranking": [
-                    {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"},
-                    {"confidence": 0.2134543431, "name": "You can ask me about how to get started"}
-                ]
-              }
-            }
-        }
-
-:Description:
-
-    DIET Selector component can be used to build a response retrieval model to directly predict a bot response from
-    a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
-    It embeds user inputs and response labels into the same space and follows the exact same
     neural network architecture and optimization as the ``DIETClassifier``.
 
     .. note:: If during prediction time a message contains **only** words unseen during training,
@@ -1107,13 +992,13 @@ DIETSelector
             # sizes of hidden layers before the embedding layer
             # for input words and intent labels,
             # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
+            "hidden_layers_sizes": {"text": [256, 128], "label": [256, 128]}
             # Whether to share the hidden layer weights between input words and labels
             "share_hidden_layers": False
             # number of units in transformer
-            "transformer_size": 256
+            "transformer_size": None
             # number of transformer layers
-            "number_of_transformer_layers": 2
+            "number_of_transformer_layers": 0
             # number of attention heads in transformer
             "number_of_attention_heads": 4
             # max sequence length
diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml
index f1088d9a442a..dedb0b714890 100644
--- a/rasa/cli/initial_project/config.yml
+++ b/rasa/cli/initial_project/config.yml
@@ -12,7 +12,7 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
 
 # Configuration for Rasa Core.
 # https://rasa.com/docs/rasa/core/policies/
diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py
index f9b3a2217f4d..f00cd71e823f 100644
--- a/rasa/nlu/registry.py
+++ b/rasa/nlu/registry.py
@@ -33,7 +33,6 @@
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.response_selector import ResponseSelector
-from rasa.nlu.selectors.diet_selector import DIETSelector
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
@@ -93,7 +92,6 @@
     EmbeddingIntentClassifier,
     # selectors
     ResponseSelector,
-    DIETSelector,
 ]
 
 # Mapping from a components name to its class to allow name based lookup.
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
deleted file mode 100644
index a6e691a7c8cb..000000000000
--- a/rasa/nlu/selectors/diet_selector.py
+++ /dev/null
@@ -1,427 +0,0 @@
-import logging
-
-import numpy as np
-import tensorflow as tf
-
-from typing import Any, Dict, Optional, Text, Tuple, Union
-
-from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
-from rasa.nlu.components import any_of
-from rasa.utils.tensorflow.constants import (
-    LABEL,
-    HIDDEN_LAYERS_SIZES,
-    SHARE_HIDDEN_LAYERS,
-    TRANSFORMER_SIZE,
-    NUM_TRANSFORMER_LAYERS,
-    NUM_HEADS,
-    MAX_SEQ_LENGTH,
-    BATCH_SIZES,
-    BATCH_STRATEGY,
-    EPOCHS,
-    RANDOM_SEED,
-    LEARNING_RATE,
-    DENSE_DIM,
-    RANKING_LENGTH,
-    LOSS_TYPE,
-    SIMILARITY_TYPE,
-    NUM_NEG,
-    SPARSE_INPUT_DROPOUT,
-    MASKED_LM,
-    ENTITY_RECOGNITION,
-    INTENT_CLASSIFICATION,
-    EVAL_NUM_EXAMPLES,
-    EVAL_NUM_EPOCHS,
-    UNIDIRECTIONAL_ENCODER,
-    DROPRATE,
-    DROPRATE_ATTENTION,
-    NEG_MARGIN_SCALE,
-    REGULARIZATION_CONSTANT,
-    SCALE_LOSS,
-    EMBED_DIM,
-    BILOU_FLAG,
-    KEY_RELATIVE_ATTENTION,
-    VALUE_RELATIVE_ATTENTION,
-    MAX_RELATIVE_POSITION,
-    USE_MAX_NEG_SIM,
-    MAX_NEG_SIM,
-    MAX_POS_SIM,
-)
-from rasa.nlu.constants import (
-    RESPONSE,
-    RESPONSE_SELECTOR_PROPERTY_NAME,
-    DEFAULT_OPEN_UTTERANCE_TYPE,
-    DENSE_FEATURE_NAMES,
-    TEXT,
-    SPARSE_FEATURE_NAMES,
-)
-from rasa.utils.tensorflow.model_data import RasaModelData
-from rasa.utils.tensorflow.models import RasaModel
-
-
-logger = logging.getLogger(__name__)
-
-
-class DIETSelector(DIETClassifier):
-    """Response selector using supervised embeddings.
-
-    The response selector embeds user inputs
-    and candidate response into the same space.
-    Supervised embeddings are trained by maximizing similarity between them.
-    It also provides rankings of the response that did not "win".
-
-    The supervised response selector needs to be preceded by
-    a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that
-    can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``.
-
-    Based on the starspace idea from: https://arxiv.org/abs/1709.03856.
-    However, in this implementation the `mu` parameter is treated differently
-    and additional hidden layers are added together with dropout.
-    """
-
-    provides = [RESPONSE, "response_ranking"]
-
-    requires = [
-        any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
-        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
-    ]
-
-    # default properties (DOC MARKER - don't remove)
-    defaults = {
-        # nn architecture
-        # sizes of hidden layers before the embedding layer
-        # for input words and responses
-        # the number of hidden layers is thus equal to the length of this list
-        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent labels
-        SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
-        TRANSFORMER_SIZE: None,
-        # number of transformer layers
-        NUM_TRANSFORMER_LAYERS: 0,
-        # number of attention heads in transformer
-        NUM_HEADS: 4,
-        # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
-        BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
-        # number of epochs
-        EPOCHS: 300,
-        # set random seed to any int to get reproducible results
-        RANDOM_SEED: None,
-        # optimizer
-        LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 512},
-        # dimension size of embedding vectors
-        EMBED_DIM: 20,
-        # the type of the similarity
-        NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect actions
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
-        # number of top responses to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
-        RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct intent labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect intent labels
-        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # flag: if true, only minimize the maximum similarity for
-        # incorrect intent labels
-        USE_MAX_NEG_SIM: True,
-        # scale loss inverse proportionally to confidence of correct prediction
-        SCALE_LOSS: True,
-        # regularization parameters
-        # the scale of L2 regularization
-        REGULARIZATION_CONSTANT: 0.002,
-        # the scale of how critical the algorithm should be of minimizing the
-        # maximum similarity between embeddings of different intent labels
-        NEG_MARGIN_SCALE: 0.8,
-        # dropout rate for rnn
-        DROPRATE: 0.2,
-        # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
-        # use a unidirectional or bidirectional encoder
-        UNIDIRECTIONAL_ENCODER: False,
-        # if true apply dropout to sparse tensors
-        SPARSE_INPUT_DROPOUT: False,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
-        # if true random tokens of the input message will be masked and the model
-        # should predict those tokens
-        MASKED_LM: False,
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
-        # selector config
-        # name of the intent for which this response selector is to be trained
-        "retrieval_intent": None,
-    }
-    # end default properties (DOC MARKER - don't remove)
-
-    def __init__(
-        self,
-        component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
-        model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
-    ) -> None:
-
-        component_config = component_config or {}
-
-        # the following properties cannot be adapted for the ResponseSelector
-        component_config[INTENT_CLASSIFICATION] = True
-        component_config[ENTITY_RECOGNITION] = False
-        component_config[BILOU_FLAG] = False
-
-        super().__init__(
-            component_config,
-            inverted_label_dict,
-            inverted_tag_dict,
-            model,
-            batch_tuple_sizes,
-        )
-
-    @property
-    def label_key(self) -> Text:
-        return "label_ids"
-
-    @staticmethod
-    def model_class():
-        return DIET2DIET
-
-    def _load_selector_params(self, config: Dict[Text, Any]) -> None:
-        self.retrieval_intent = config["retrieval_intent"]
-        if not self.retrieval_intent:
-            # retrieval intent was left to its default value
-            logger.info(
-                "Retrieval intent parameter was left to its default value. This "
-                "response selector will be trained on training examples combining "
-                "all retrieval intents."
-            )
-
-    def _check_config_parameters(self) -> None:
-        super()._check_config_parameters()
-        self._load_selector_params(self.component_config)
-
-    @staticmethod
-    def _set_message_property(
-        message: Message, prediction_dict: Dict[Text, Any], selector_key: Text
-    ) -> None:
-
-        message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {})
-        message_selector_properties[selector_key] = prediction_dict
-        message.set(
-            RESPONSE_SELECTOR_PROPERTY_NAME,
-            message_selector_properties,
-            add_to_output=True,
-        )
-
-    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
-        """Performs sanity checks on training data, extracts encodings for labels
-        and prepares data for training"""
-        if self.retrieval_intent:
-            training_data = training_data.filter_by_intent(self.retrieval_intent)
-
-        label_id_dict = self._create_label_id_dict(training_data, attribute=RESPONSE)
-        self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
-
-        self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=RESPONSE
-        )
-
-        model_data = self._create_model_data(
-            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
-        )
-
-        self.check_input_dimension_consistency(model_data)
-
-        return model_data
-
-    def process(self, message: Message, **kwargs: Any) -> None:
-        """Return the most likely response and its similarity to the input."""
-
-        out = self._predict(message)
-        label, label_ranking = self._predict_label(out)
-
-        selector_key = (
-            self.retrieval_intent
-            if self.retrieval_intent
-            else DEFAULT_OPEN_UTTERANCE_TYPE
-        )
-
-        logger.debug(
-            f"Adding following selector key to message property: {selector_key}"
-        )
-
-        prediction_dict = {"response": label, "ranking": label_ranking}
-
-        self._set_message_property(message, prediction_dict, selector_key)
-
-
-class DIET2DIET(DIET):
-    def _check_data(self) -> None:
-        if "text_features" not in self.data_signature:
-            raise ValueError(
-                f"No text features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if "label_features" not in self.data_signature:
-            raise ValueError(
-                f"No label features specified. "
-                f"Cannot train '{self.__class__.__name__}' model."
-            )
-        if (
-            self.config[SHARE_HIDDEN_LAYERS]
-            and self.data_signature["text_features"]
-            != self.data_signature["label_features"]
-        ):
-            raise ValueError(
-                "If hidden layer weights are shared, data signatures "
-                "for text_features and label_features must coincide."
-            )
-
-    def _create_metrics(self) -> None:
-        # self.metrics preserve order
-        # output losses first
-        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
-        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
-        # output accuracies second
-        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
-        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
-
-    def _update_metrics_to_log(self) -> None:
-        if self.config[MASKED_LM]:
-            self.metrics_to_log += ["m_loss", "m_acc"]
-
-        self.metrics_to_log += ["r_loss", "r_acc"]
-
-    def _prepare_layers(self) -> None:
-        self.text_name = TEXT
-        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
-
-        self._prepare_sequence_layers(self.text_name)
-        self._prepare_sequence_layers(self.label_name)
-        if self.config[MASKED_LM]:
-            self._prepare_mask_lm_layers(self.text_name)
-        self._prepare_label_classification_layers()
-
-    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label_ids = self.tf_label_data["label_ids"][0]
-
-        mask_label = self.tf_label_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data["label_features"], mask_label, self.label_name
-        )
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-
-        all_labels_embed = self._tf_layers["embed.label"](cls_label)
-
-        return all_label_ids, all_labels_embed
-
-    def batch_loss(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> tf.Tensor:
-        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        (
-            text_transformed,
-            text_in,
-            text_seq_ids,
-            lm_mask_bool_text,
-        ) = self._create_sequence(
-            tf_batch_data["text_features"],
-            mask_text,
-            self.text_name,
-            self.config[MASKED_LM],
-            sequence_ids=True,
-        )
-
-        mask_label = tf_batch_data["label_mask"][0]
-        sequence_lengths_label = self._get_sequence_lengths(mask_label)
-
-        label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["label_features"], mask_label, self.label_name
-        )
-
-        losses = []
-
-        if self.config[MASKED_LM]:
-            loss, acc = self._mask_loss(
-                text_transformed,
-                text_in,
-                text_seq_ids,
-                lm_mask_bool_text,
-                self.text_name,
-            )
-
-            self.mask_loss.update_state(loss)
-            self.mask_acc.update_state(acc)
-            losses.append(loss)
-
-        # get _cls_ vector for label classification
-        cls_text = self._last_token(text_transformed, sequence_lengths_text)
-        cls_label = self._last_token(label_transformed, sequence_lengths_label)
-        label_ids = tf_batch_data["label_ids"][0]
-
-        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
-        self.response_loss.update_state(loss)
-        self.response_acc.update_state(acc)
-        losses.append(loss)
-
-        return tf.math.add_n(losses)
-
-    def batch_predict(
-        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
-    ) -> Dict[Text, tf.Tensor]:
-        tf_batch_data = self.batch_to_model_data_format(
-            batch_in, self.predict_data_signature
-        )
-
-        mask_text = tf_batch_data["text_mask"][0]
-        sequence_lengths_text = self._get_sequence_lengths(mask_text)
-
-        text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, self.text_name
-        )
-
-        out = {}
-
-        if self.all_labels_embed is None:
-            _, self.all_labels_embed = self._create_all_labels()
-
-        # get _cls_ vector for intent classification
-        cls = self._last_token(text_transformed, sequence_lengths_text)
-        cls_embed = self._tf_layers["embed.text"](cls)
-
-        sim_all = self._tf_layers["loss.label"].sim(
-            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
-        )
-        scores = self._tf_layers["loss.label"].confidence_from_sim(
-            sim_all, self.config[SIMILARITY_TYPE]
-        )
-        out["i_scores"] = scores
-
-        return out
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 5df777521d79..3d9c99bace1b 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -1,15 +1,21 @@
 import logging
 
-from typing import Any, Dict, Optional, Text
+import numpy as np
+import tensorflow as tf
+
+from typing import Any, Dict, Optional, Text, Tuple, Union
 
 from rasa.nlu.training_data import TrainingData, Message
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
 from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
+    TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
+    NUM_HEADS,
+    MAX_SEQ_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -26,15 +32,20 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
+    UNIDIRECTIONAL_ENCODER,
     DROPRATE,
+    DROPRATE_ATTENTION,
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
+    EMBED_DIM,
+    BILOU_FLAG,
+    KEY_RELATIVE_ATTENTION,
+    VALUE_RELATIVE_ATTENTION,
+    MAX_RELATIVE_POSITION,
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
-    BILOU_FLAG,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -46,8 +57,7 @@
 )
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
-from rasa.utils.common import raise_warning
-from rasa.constants import DOCS_URL_COMPONENTS
+
 
 logger = logging.getLogger(__name__)
 
@@ -78,16 +88,23 @@ class ResponseSelector(DIETClassifier):
         any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
-    # please make sure to update the docs when changing a default parameter
+    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
         # for input words and responses
         # the number of hidden layers is thus equal to the length of this list
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent
-        # labels
+        # Whether to share the hidden layer weights between input words and intent labels
         SHARE_HIDDEN_LAYERS: False,
+        # number of units in transformer
+        TRANSFORMER_SIZE: None,
+        # number of transformer layers
+        NUM_TRANSFORMER_LAYERS: 0,
+        # number of attention heads in transformer
+        NUM_HEADS: 4,
+        # max sequence length if pos_encoding='emb'
+        MAX_SEQ_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -132,6 +149,10 @@ class ResponseSelector(DIETClassifier):
         NEG_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
         DROPRATE: 0.2,
+        # dropout rate for attention
+        DROPRATE_ATTENTION: 0,
+        # use a unidirectional or bidirectional encoder
+        UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
@@ -139,10 +160,20 @@ class ResponseSelector(DIETClassifier):
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
         # how many examples to use for calculation of training accuracy
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
+        # if true random tokens of the input message will be masked and the model
+        # should predict those tokens
+        MASKED_LM: False,
+        # if true use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # if true use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
         # selector config
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
     }
+    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
@@ -159,8 +190,6 @@ def __init__(
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
         component_config[BILOU_FLAG] = False
-        component_config[MASKED_LM] = False
-        component_config[NUM_TRANSFORMER_LAYERS] = 0
 
         super().__init__(
             component_config,
@@ -170,17 +199,14 @@ def __init__(
             batch_tuple_sizes,
         )
 
-        raise_warning(
-            f"'ResponseSelector' is deprecated and will be removed in version 2.0. "
-            f"Use 'DIETSelector' instead.",
-            category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
-        )
-
     @property
     def label_key(self) -> Text:
         return "label_ids"
 
+    @staticmethod
+    def model_class():
+        return DIET2DIET
+
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
         self.retrieval_intent = config["retrieval_intent"]
         if not self.retrieval_intent:
@@ -248,3 +274,154 @@ def process(self, message: Message, **kwargs: Any) -> None:
         prediction_dict = {"response": label, "ranking": label_ranking}
 
         self._set_message_property(message, prediction_dict, selector_key)
+
+
+class DIET2DIET(DIET):
+    def _check_data(self) -> None:
+        if "text_features" not in self.data_signature:
+            raise ValueError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if "label_features" not in self.data_signature:
+            raise ValueError(
+                f"No label features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if (
+            self.config[SHARE_HIDDEN_LAYERS]
+            and self.data_signature["text_features"]
+            != self.data_signature["label_features"]
+        ):
+            raise ValueError(
+                "If hidden layer weights are shared, data signatures "
+                "for text_features and label_features must coincide."
+            )
+
+    def _create_metrics(self) -> None:
+        # self.metrics preserve order
+        # output losses first
+        self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
+        self.response_loss = tf.keras.metrics.Mean(name="r_loss")
+        # output accuracies second
+        self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
+        self.response_acc = tf.keras.metrics.Mean(name="r_acc")
+
+    def _update_metrics_to_log(self) -> None:
+        if self.config[MASKED_LM]:
+            self.metrics_to_log += ["m_loss", "m_acc"]
+
+        self.metrics_to_log += ["r_loss", "r_acc"]
+
+    def _prepare_layers(self) -> None:
+        self.text_name = TEXT
+        self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL
+
+        self._prepare_sequence_layers(self.text_name)
+        self._prepare_sequence_layers(self.label_name)
+        if self.config[MASKED_LM]:
+            self._prepare_mask_lm_layers(self.text_name)
+        self._prepare_label_classification_layers()
+
+    def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
+        all_label_ids = self.tf_label_data["label_ids"][0]
+
+        mask_label = self.tf_label_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            self.tf_label_data["label_features"], mask_label, self.label_name
+        )
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+
+        all_labels_embed = self._tf_layers["embed.label"](cls_label)
+
+        return all_label_ids, all_labels_embed
+
+    def batch_loss(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> tf.Tensor:
+        tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        (
+            text_transformed,
+            text_in,
+            text_seq_ids,
+            lm_mask_bool_text,
+        ) = self._create_sequence(
+            tf_batch_data["text_features"],
+            mask_text,
+            self.text_name,
+            self.config[MASKED_LM],
+            sequence_ids=True,
+        )
+
+        mask_label = tf_batch_data["label_mask"][0]
+        sequence_lengths_label = self._get_sequence_lengths(mask_label)
+
+        label_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["label_features"], mask_label, self.label_name
+        )
+
+        losses = []
+
+        if self.config[MASKED_LM]:
+            loss, acc = self._mask_loss(
+                text_transformed,
+                text_in,
+                text_seq_ids,
+                lm_mask_bool_text,
+                self.text_name,
+            )
+
+            self.mask_loss.update_state(loss)
+            self.mask_acc.update_state(acc)
+            losses.append(loss)
+
+        # get _cls_ vector for label classification
+        cls_text = self._last_token(text_transformed, sequence_lengths_text)
+        cls_label = self._last_token(label_transformed, sequence_lengths_label)
+        label_ids = tf_batch_data["label_ids"][0]
+
+        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
+        self.response_loss.update_state(loss)
+        self.response_acc.update_state(acc)
+        losses.append(loss)
+
+        return tf.math.add_n(losses)
+
+    def batch_predict(
+        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
+    ) -> Dict[Text, tf.Tensor]:
+        tf_batch_data = self.batch_to_model_data_format(
+            batch_in, self.predict_data_signature
+        )
+
+        mask_text = tf_batch_data["text_mask"][0]
+        sequence_lengths_text = self._get_sequence_lengths(mask_text)
+
+        text_transformed, _, _, _ = self._create_sequence(
+            tf_batch_data["text_features"], mask_text, self.text_name
+        )
+
+        out = {}
+
+        if self.all_labels_embed is None:
+            _, self.all_labels_embed = self._create_all_labels()
+
+        # get _cls_ vector for intent classification
+        cls = self._last_token(text_transformed, sequence_lengths_text)
+        cls_embed = self._tf_layers["embed.text"](cls)
+
+        sim_all = self._tf_layers["loss.label"].sim(
+            cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
+        )
+        scores = self._tf_layers["loss.label"].confidence_from_sim(
+            sim_all, self.config[SIMILARITY_TYPE]
+        )
+        out["i_scores"] = scores
+
+        return out
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 02fd54764ba3..333ac185aaf7 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -17,7 +17,7 @@
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETSelector", EPOCHS: 2},
+            {"name": "ResponseSelector", EPOCHS: 2},
         ],
     ],
 )
diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index f8ac3176d115..03e900d8d48d 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -317,7 +317,7 @@ def test_run_cv_evaluation_with_response_selector():
                 {"name": "WhitespaceTokenizer"},
                 {"name": "CountVectorsFeaturizer"},
                 {"name": "DIETClassifier", EPOCHS: 2},
-                {"name": "DIETSelector", EPOCHS: 2},
+                {"name": "ResponseSelector", EPOCHS: 2},
             ],
         }
     )
diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 9037d0c7971d..bdd0d196ecba 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -41,7 +41,6 @@ def pipelines_for_tests():
                 "DIETClassifier",
                 "EmbeddingIntentClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -60,7 +59,6 @@ def pipelines_for_tests():
                 "SklearnIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -77,7 +75,6 @@ def pipelines_for_tests():
                 "DucklingHTTPExtractor",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -93,7 +90,6 @@ def pipelines_for_tests():
                 "DucklingHTTPExtractor",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -110,7 +106,6 @@ def pipelines_for_tests():
                 "MitieIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
@@ -126,7 +121,6 @@ def pipelines_for_tests():
                 "MitieIntentClassifier",
                 "DIETClassifier",
                 "ResponseSelector",
-                "DIETSelector",
                 "EntitySynonymMapper",
             ),
         ),
diff --git a/tests/utilities.py b/tests/utilities.py
index f776052ac9be..70637933b192 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -4,7 +4,6 @@
 import rasa.utils.io as io_utils
 from nlu.classifiers.diet_classifier import DIETClassifier
 from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
-from nlu.selectors.diet_selector import DIETSelector
 from nlu.selectors.response_selector import ResponseSelector
 from utils.tensorflow.constants import EPOCHS
 
@@ -32,7 +31,6 @@ def update_number_of_epochs(config_path: Text, output_file: Text):
             EmbeddingIntentClassifier.name,
             DIETClassifier.name,
             ResponseSelector.name,
-            DIETSelector.name,
         ]:
             component[EPOCHS] = 2
 

From f62c65b65be43fcdc8ac9d53cf9790f87d6e1eba Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 13:55:42 +0100
Subject: [PATCH 494/633] remove doc markers

---
 rasa/core/policies/embedding_policy.py  | 2 --
 rasa/core/policies/ted_policy.py        | 2 --
 rasa/nlu/classifiers/diet_classifier.py | 2 --
 rasa/nlu/selectors/response_selector.py | 2 --
 4 files changed, 8 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index a500abca6f48..2a12f2f0a454 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -48,7 +48,6 @@ class EmbeddingPolicy(TEDPolicy):
     The policy used in our paper https://arxiv.org/abs/1910.00486
     """
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # a list of hidden layers sizes before dialogue and action embed layers
@@ -118,7 +117,6 @@ class EmbeddingPolicy(TEDPolicy):
         # max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 9d65e00901e7..95754a41d4b7 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -69,7 +69,6 @@ class TEDPolicy(Policy):
 
     SUPPORTS_ONLINE_TRAINING = True
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # a list of hidden layers sizes before dialogue and action embed layers
@@ -139,7 +138,6 @@ class TEDPolicy(Policy):
         # how many examples to use for hold out validation set
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
     }
-    # end default properties (DOC MARKER - don't remove)
 
     @staticmethod
     def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer:
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 935c8356f7ca..472f57a00e74 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -101,7 +101,6 @@ class DIETClassifier(EntityExtractor):
 
     requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -191,7 +190,6 @@ class DIETClassifier(EntityExtractor):
         # rule of thumb: use only if more than 100 egs. per entity
         BILOU_FLAG: True,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     # init helpers
     def _check_config_parameters(self) -> None:
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 3d9c99bace1b..6214800d7cbf 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -88,7 +88,6 @@ class ResponseSelector(DIETClassifier):
         any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
     ]
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
         # sizes of hidden layers before the embedding layer
@@ -173,7 +172,6 @@ class ResponseSelector(DIETClassifier):
         # name of the intent for which this response selector is to be trained
         "retrieval_intent": None,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,

From a8870e30b9ce7dc23a2dcd7076bbb37adae3ae9b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 20 Feb 2020 14:19:21 +0100
Subject: [PATCH 495/633] fix link in docs

---
 docs/nlu/choosing-a-pipeline.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 463437642b35..bcd5bb0d4658 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -108,8 +108,8 @@ Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Depending on your data you may want to only perform intent classification or entity recognition.
 We support several components for each of the task. All of them are listed in :ref:`components`.
-We recommend to use :ref:`diet-classifier` for intent classification and entity recognition and :ref:`response-selector`
-for response selection.
+We recommend to use :ref:`diet-classifier` for intent classification and entity recognition
+and :ref:`response-selector` for response selection.
 
 Comparing different pipelines for your data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From dbb66359780eb7b9d6682ae93eda071372126cc3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:33:14 +0100
Subject: [PATCH 496/633] review comments

---
 docs/nlu/components.rst                       |  6 +--
 rasa/core/policies/embedding_policy.py        | 10 ++---
 rasa/core/policies/keras_policy.py            |  4 +-
 rasa/core/policies/ted_policy.py              | 26 ++++++-----
 rasa/nlu/classifiers/diet_classifier.py       | 44 ++++++++++++-------
 .../embedding_intent_classifier.py            | 10 ++---
 .../classifiers/sklearn_intent_classifier.py  |  2 +-
 rasa/nlu/config.py                            |  2 +-
 rasa/nlu/extractors/crf_entity_extractor.py   |  2 +-
 .../dense_featurizer/mitie_featurizer.py      |  2 +-
 rasa/nlu/selectors/diet_selector.py           | 12 ++---
 rasa/nlu/selectors/response_selector.py       |  8 ++--
 rasa/utils/tensorflow/constants.py            |  6 +--
 rasa/utils/train_utils.py                     | 12 ++---
 14 files changed, 83 insertions(+), 63 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 17121356a6aa..5a3386198d7c 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -753,7 +753,7 @@ EmbeddingIntentClassifier
 
     .. warning::
         ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
-        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
 
 :Configuration:
 
@@ -977,7 +977,7 @@ ResponseSelector
 
     .. warning::
         ``ResponseSelector`` is deprecated and should be replaced by ``DIETSelector``. See
-        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
 
 :Configuration:
 
@@ -1347,7 +1347,7 @@ CRFEntityExtractor
 
     .. warning::
         ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See
-        `migration guide <https://rasa.com/docs/rasa/migration-guide/#rasa-1-7-to-rasa-1-8>`_ for more details.
+        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
 
 :Configuration:
     You need to configure what kind of features the CRF should use.
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index a500abca6f48..184725727745 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -11,7 +11,7 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
+    MAX_SEQUENCE_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -28,7 +28,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
@@ -59,7 +59,7 @@ class EmbeddingPolicy(TEDPolicy):
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
         # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
+        MAX_SEQUENCE_LENGTH: 256,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # training parameters
@@ -74,7 +74,7 @@ class EmbeddingPolicy(TEDPolicy):
         RANDOM_SEED: None,
         # embedding parameters
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect labels
@@ -135,5 +135,5 @@ def __init__(
             f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. "
             f"Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs=DOCS_URL_POLICIES,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
         )
diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index a6497f6c5ce0..4e00b3cb2f5d 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -75,7 +75,7 @@ def __init__(
             "'KerasPolicy' is deprecated and will be removed in version "
             "2.0. Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs=DOCS_URL_POLICIES,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
         )
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
@@ -177,7 +177,7 @@ def train(
                 shuffled_X.shape[1:], shuffled_y.shape[1:]
             )
 
-        logger.info(
+        logger.debug(
             "Fitting model with {} total samples and a "
             "validation split of {}"
             "".format(training_data.num_examples(), self.validation_split)
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 9d65e00901e7..f0a2c13087af 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -31,7 +31,7 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
+    MAX_SEQUENCE_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
@@ -48,7 +48,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     DROPRATE_ATTENTION,
@@ -80,7 +80,7 @@ class TEDPolicy(Policy):
         # number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
         # max sequence length
-        MAX_SEQ_LENGTH: 256,
+        MAX_SEQUENCE_LENGTH: 256,
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # if true use key relative embeddings in attention
@@ -101,7 +101,7 @@ class TEDPolicy(Policy):
         RANDOM_SEED: None,
         # embedding parameters
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect labels
@@ -158,7 +158,7 @@ def __init__(
         model: Optional[RasaModel] = None,
         **kwargs: Dict[Text, Any],
     ) -> None:
-        """Declare instant variables with default values"""
+        """Declare instance variables with default values"""
 
         if not featurizer:
             featurizer = self._standard_featurizer(max_history)
@@ -180,8 +180,14 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
 
         self.config = train_utils.update_similarity_type(self.config)
 
-        if self.config[EVAL_NUM_EPOCHS] < 1:
+        if self.config[EVAL_NUM_EPOCHS] == -1:
+            # magic value -1 is used to set evaluation to number of epochs
             self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
+        elif self.config[EVAL_NUM_EPOCHS] < 1:
+            raise ValueError(
+                f"'{EVAL_NUM_EXAMPLES}' is set to '{self.config[EVAL_NUM_EPOCHS]}'. "
+                f"Only values > 1 are allowed for this configuration value."
+            )
 
     # data helpers
     # noinspection PyPep8Naming
@@ -364,7 +370,7 @@ def persist(self, path: Text):
             )
             return
 
-        file_name = "TED_policy"
+        file_name = "ted_policy"
         tf_model_file = os.path.join(path, f"{file_name}.tf_model")
 
         rasa.utils.io.create_directory_for_file(tf_model_file)
@@ -527,7 +533,7 @@ def _prepare_layers(self) -> None:
             self.config[TRANSFORMER_SIZE],
             self.config[NUM_HEADS],
             self.config[TRANSFORMER_SIZE] * 4,
-            self.config[MAX_SEQ_LENGTH],
+            self.config[MAX_SEQUENCE_LENGTH],
             self.config[REGULARIZATION_CONSTANT],
             dropout_rate=self.config[DROPRATE_DIALOGUE],
             attention_dropout_rate=self.config[DROPRATE_ATTENTION],
@@ -538,13 +544,13 @@ def _prepare_layers(self) -> None:
             name=DIALOGUE + "_encoder",
         )
         self._tf_layers["embed.dialogue"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             DIALOGUE,
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.label"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             LABEL,
             self.config[SIMILARITY_TYPE],
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 935c8356f7ca..44c3154640a7 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -41,13 +41,13 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
+    MAX_SEQUENCE_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
     RANDOM_SEED,
     LEARNING_RATE,
-    DENSE_DIM,
+    DENSE_DIMENSION,
     RANKING_LENGTH,
     LOSS_TYPE,
     SIMILARITY_TYPE,
@@ -67,7 +67,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     BILOU_FLAG,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
@@ -123,7 +123,7 @@ class DIETClassifier(EntityExtractor):
         # max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
         # max sequence length
-        MAX_SEQ_LENGTH: 256,
+        MAX_SEQUENCE_LENGTH: 256,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # training parameters
@@ -140,9 +140,9 @@ class DIETClassifier(EntityExtractor):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 20},
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
@@ -209,9 +209,14 @@ def _check_config_parameters(self) -> None:
             )
 
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
-            v1 = next(iter(self.component_config[HIDDEN_LAYERS_SIZES].values()))
+            first_hidden_layer_size = next(
+                iter(self.component_config[HIDDEN_LAYERS_SIZES].values())
+            )
             if any(
-                v != v1 for v in self.component_config[HIDDEN_LAYERS_SIZES].values()
+                current_hidden_layer_size != first_hidden_layer_size
+                for current_hidden_layer_size in self.component_config[
+                    HIDDEN_LAYERS_SIZES
+                ].values()
             ):
                 raise ValueError(
                     f"If hidden layer weights are shared, "
@@ -222,8 +227,15 @@ def _check_config_parameters(self) -> None:
             self.component_config
         )
 
-        if self.component_config[EVAL_NUM_EPOCHS] < 1:
+        if self.component_config[EVAL_NUM_EPOCHS] == -1:
+            # magic value -1 is used to set evaluation to number of epochs
             self.component_config[EVAL_NUM_EPOCHS] = self.component_config[EPOCHS]
+        elif self.component_config[EVAL_NUM_EPOCHS] < 1:
+            raise ValueError(
+                f"'{EVAL_NUM_EXAMPLES}' is set to "
+                f"'{self.component_config[EVAL_NUM_EPOCHS]}'. "
+                f"Only values > 1 are allowed for this configuration value."
+            )
 
     # package safety checks
     @classmethod
@@ -371,6 +383,8 @@ def _extract_features(
         return sparse_features, dense_features
 
     def check_input_dimension_consistency(self, model_data: RasaModelData):
+        """Checks if text features and label features have the same dimensionality if
+        hidden layers are shared."""
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
             num_text_features = model_data.get_feature_dimension("text_features")
             num_label_features = model_data.get_feature_dimension("label_features")
@@ -1043,7 +1057,7 @@ def _prepare_input_layers(self, name: Text) -> None:
             self.data_signature[f"{name}_features"],
             name,
             self.config[REGULARIZATION_CONSTANT],
-            self.config[DENSE_DIM][name],
+            self.config[DENSE_DIMENSION][name],
         )
         self._tf_layers[f"ffnn.{name}"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][name],
@@ -1061,7 +1075,7 @@ def _prepare_sequence_layers(self, name: Text) -> None:
                 self.config[TRANSFORMER_SIZE],
                 self.config[NUM_HEADS],
                 self.config[TRANSFORMER_SIZE] * 4,
-                self.config[MAX_SEQ_LENGTH],
+                self.config[MAX_SEQUENCE_LENGTH],
                 self.config[REGULARIZATION_CONSTANT],
                 dropout_rate=self.config[DROPRATE],
                 attention_dropout_rate=self.config[DROPRATE_ATTENTION],
@@ -1078,13 +1092,13 @@ def _prepare_sequence_layers(self, name: Text) -> None:
     def _prepare_mask_lm_layers(self, name: Text) -> None:
         self._tf_layers[f"{name}_input_mask"] = layers.InputMask()
         self._tf_layers[f"embed.{name}_lm_mask"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             f"{name}_lm_mask",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers[f"embed.{name}_golden_token"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             f"{name}_golden_token",
             self.config[SIMILARITY_TYPE],
@@ -1103,13 +1117,13 @@ def _prepare_mask_lm_layers(self, name: Text) -> None:
 
     def _prepare_label_classification_layers(self) -> None:
         self._tf_layers["embed.text"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             "text",
             self.config[SIMILARITY_TYPE],
         )
         self._tf_layers["embed.label"] = layers.Embed(
-            self.config[EMBED_DIM],
+            self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             "label",
             self.config[SIMILARITY_TYPE],
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 90f9b5881c75..aa674a366a73 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -15,7 +15,7 @@
     EPOCHS,
     RANDOM_SEED,
     LEARNING_RATE,
-    DENSE_DIM,
+    DENSE_DIMENSION,
     RANKING_LENGTH,
     LOSS_TYPE,
     SIMILARITY_TYPE,
@@ -33,7 +33,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     BILOU_FLAG,
 )
 from rasa.utils.common import raise_warning
@@ -71,9 +71,9 @@ class EmbeddingIntentClassifier(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 20},
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
@@ -139,5 +139,5 @@ def __init__(
             "'EmbeddingIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
         )
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index cfc976d0966a..8faf484cce41 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -67,7 +67,7 @@ def __init__(
             "'SklearnIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
         )
 
     @classmethod
diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py
index a8b1e4155903..7887a582888c 100644
--- a/rasa/nlu/config.py
+++ b/rasa/nlu/config.py
@@ -124,7 +124,7 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No
                     "the components you want to use directly to your configuration "
                     "file.",
                     FutureWarning,
-                    docs=DOCS_URL_PIPELINE,
+                    docs="https://rasa.com/docs/rasa/migration-guide/",
                 )
 
                 # replaces the template with the actual components
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index f3da1e3836ac..b98b5daeb18e 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -106,7 +106,7 @@ def __init__(
             "'CRFEntityExtractor' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs=DOCS_URL_COMPONENTS,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
         )
 
     def _validate_configuration(self) -> None:
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index cd6fbf8dcf5f..1bbab2d66af9 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -38,7 +38,7 @@ class MitieFeaturizer(Featurizer):
         "pooling": "mean"
     }
 
-    def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)
 
         self.pooling_operation = self.component_config["pooling"]
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index a6e691a7c8cb..39a9eaa7a7af 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -15,13 +15,13 @@
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
+    MAX_SEQUENCE_LENGTH,
     BATCH_SIZES,
     BATCH_STRATEGY,
     EPOCHS,
     RANDOM_SEED,
     LEARNING_RATE,
-    DENSE_DIM,
+    DENSE_DIMENSION,
     RANKING_LENGTH,
     LOSS_TYPE,
     SIMILARITY_TYPE,
@@ -38,7 +38,7 @@
     NEG_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     BILOU_FLAG,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
@@ -104,7 +104,7 @@ class DIETSelector(DIETClassifier):
         # number of attention heads in transformer
         NUM_HEADS: 4,
         # max sequence length if pos_encoding='emb'
-        MAX_SEQ_LENGTH: 256,
+        MAX_SEQUENCE_LENGTH: 256,
         # training parameters
         # initial and final batch sizes - batch size will be
         # linearly increased for each epoch
@@ -119,9 +119,9 @@ class DIETSelector(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 512},
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 5df777521d79..8a231f68c3a9 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -15,7 +15,7 @@
     EPOCHS,
     RANDOM_SEED,
     LEARNING_RATE,
-    DENSE_DIM,
+    DENSE_DIMENSION,
     RANKING_LENGTH,
     LOSS_TYPE,
     SIMILARITY_TYPE,
@@ -33,7 +33,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     BILOU_FLAG,
 )
 from rasa.nlu.constants import (
@@ -102,9 +102,9 @@ class ResponseSelector(DIETClassifier):
         LEARNING_RATE: 0.001,
         # embedding parameters
         # default dense dimension used if no dense features are present
-        DENSE_DIM: {TEXT: 512, LABEL: 512},
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
         # dimension size of embedding vectors
-        EMBED_DIM: 20,
+        EMBEDDING_DIMENSION: 20,
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect actions
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index bbeef0c6539a..a9247b9b3f8b 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -12,7 +12,7 @@
 VALUE_RELATIVE_ATTENTION = "use_value_relative_attention"
 MAX_RELATIVE_POSITION = "max_relative_position"
 
-MAX_SEQ_LENGTH = "maximum_sequence_length"
+MAX_SEQUENCE_LENGTH = "maximum_sequence_length"
 
 BATCH_SIZES = "batch_size"
 BATCH_STRATEGY = "batch_strategy"
@@ -20,8 +20,8 @@
 RANDOM_SEED = "random_seed"
 LEARNING_RATE = "learning_rate"
 
-DENSE_DIM = "dense_dimension"
-EMBED_DIM = "embedding_dimension"
+DENSE_DIMENSION = "dense_dimension"
+EMBEDDING_DIMENSION = "embedding_dimension"
 
 SIMILARITY_TYPE = "similarity_type"
 LOSS_TYPE = "loss_type"
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 0d68f6a9744d..3fcca407c405 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -13,8 +13,8 @@
     HIDDEN_LAYERS_SIZES,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
-    MAX_SEQ_LENGTH,
-    DENSE_DIM,
+    MAX_SEQUENCE_LENGTH,
+    DENSE_DIMENSION,
     LOSS_TYPE,
     SIMILARITY_TYPE,
     NUM_NEG,
@@ -24,7 +24,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
-    EMBED_DIM,
+    EMBEDDING_DIMENSION,
     DROPRATE_DIALOGUE,
     DROPRATE_LABEL,
     NEG_MARGIN_SCALE,
@@ -155,9 +155,9 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
         "num_transformer_layers", NUM_TRANSFORMER_LAYERS, config
     )
     config = _replace_deprecated_option("num_heads", NUM_HEADS, config)
-    config = _replace_deprecated_option("max_seq_length", MAX_SEQ_LENGTH, config)
-    config = _replace_deprecated_option("dense_dim", DENSE_DIM, config)
-    config = _replace_deprecated_option("embed_dim", EMBED_DIM, config)
+    config = _replace_deprecated_option("max_seq_length", MAX_SEQUENCE_LENGTH, config)
+    config = _replace_deprecated_option("dense_dim", DENSE_DIMENSION, config)
+    config = _replace_deprecated_option("embed_dim", EMBEDDING_DIMENSION, config)
     config = _replace_deprecated_option("num_neg", NUM_NEG, config)
     config = _replace_deprecated_option("mu_pos", MAX_POS_SIM, config)
     config = _replace_deprecated_option("mu_neg", MAX_NEG_SIM, config)

From a06cff4b17f3e7a37d383962ca56cb51f33c4ff8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:36:13 +0100
Subject: [PATCH 497/633] fix imports in tests/utitlities.py

---
 tests/utilities.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/utilities.py b/tests/utilities.py
index f776052ac9be..118af373919d 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -2,11 +2,11 @@
 from yarl import URL
 
 import rasa.utils.io as io_utils
-from nlu.classifiers.diet_classifier import DIETClassifier
-from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
-from nlu.selectors.diet_selector import DIETSelector
-from nlu.selectors.response_selector import ResponseSelector
-from utils.tensorflow.constants import EPOCHS
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.selectors.diet_selector import DIETSelector
+from rasa.nlu.selectors.response_selector import ResponseSelector
+from rasa.utils.tensorflow.constants import EPOCHS
 
 
 def latest_request(mocked, request_type, path):

From 1a8b820c44a6399fc5450949fdd55af9c4587329 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:53:41 +0100
Subject: [PATCH 498/633] use json.dump and json.load in lexical syntactic
 featurizer

---
 .../sparse_featurizer/lexical_syntactic_featurizer.py        | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 2b71bd6a2459..ca6cf920a406 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -1,3 +1,4 @@
+import json
 import logging
 from collections import defaultdict, OrderedDict
 
@@ -271,7 +272,7 @@ def load(
         with open(
             os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "rb"
         ) as f:
-            feature_to_idx_dict = pickle.load(f)
+            feature_to_idx_dict = json.load(f)
 
         return LexicalSyntacticFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict)
 
@@ -282,6 +283,6 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         with open(
             os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "wb"
         ) as f:
-            pickle.dump(self.feature_to_idx_dict, f)
+            json.dump(self.feature_to_idx_dict, f)
 
         return {"file": file_name}

From 140dba99582f775c1cfe3b8b83aba52303705687 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:54:29 +0100
Subject: [PATCH 499/633] retrieval_intent is now a constant

---
 rasa/nlu/selectors/diet_selector.py     | 5 +++--
 rasa/nlu/selectors/response_selector.py | 5 +++--
 rasa/utils/tensorflow/constants.py      | 2 ++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index 39a9eaa7a7af..6340de362263 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -46,6 +46,7 @@
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
+    RETRIEVAL_INTENT,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -171,7 +172,7 @@ class DIETSelector(DIETClassifier):
         MAX_RELATIVE_POSITION: None,
         # selector config
         # name of the intent for which this response selector is to be trained
-        "retrieval_intent": None,
+        RETRIEVAL_INTENT: None,
     }
     # end default properties (DOC MARKER - don't remove)
 
@@ -208,7 +209,7 @@ def model_class():
         return DIET2DIET
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
-        self.retrieval_intent = config["retrieval_intent"]
+        self.retrieval_intent = config[RETRIEVAL_INTENT]
         if not self.retrieval_intent:
             # retrieval intent was left to its default value
             logger.info(
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 8a231f68c3a9..a27e86e6cfa1 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -35,6 +35,7 @@
     MAX_POS_SIM,
     EMBEDDING_DIMENSION,
     BILOU_FLAG,
+    RETRIEVAL_INTENT,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -141,7 +142,7 @@ class ResponseSelector(DIETClassifier):
         EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
         # selector config
         # name of the intent for which this response selector is to be trained
-        "retrieval_intent": None,
+        RETRIEVAL_INTENT: None,
     }
 
     def __init__(
@@ -182,7 +183,7 @@ def label_key(self) -> Text:
         return "label_ids"
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
-        self.retrieval_intent = config["retrieval_intent"]
+        self.retrieval_intent = config[RETRIEVAL_INTENT]
         if not self.retrieval_intent:
             # retrieval intent was left to its default value
             logger.info(
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index a9247b9b3f8b..539e36ad1c17 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -50,3 +50,5 @@
 RANKING_LENGTH = "ranking_length"
 
 BILOU_FLAG = "BILOU_flag"
+
+RETRIEVAL_INTENT = "retrieval_intent"

From 56023040885e283c3adf311bf5328b77aa9b76d5 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 14:07:33 +0100
Subject: [PATCH 500/633] made transformers lib optional and removed a few
 other deps

---
 alt_requirements/requirements_full.txt        |  3 +++
 ...nts_pretrained_embeddings_transformers.txt |  4 +++
 docs/nlu/components.rst                       |  3 +++
 .../nlu/utils/hugging_face/hf_transformers.py | 25 +++++++++++++------
 requirements.txt                              |  4 ---
 setup.py                                      |  3 ++-
 6 files changed, 29 insertions(+), 13 deletions(-)
 create mode 100644 alt_requirements/requirements_pretrained_embeddings_transformers.txt

diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt
index e033e2544462..b97e60736d04 100644
--- a/alt_requirements/requirements_full.txt
+++ b/alt_requirements/requirements_full.txt
@@ -10,4 +10,7 @@
 # ConveRT Requirements
 -r requirements_pretrained_embeddings_convert.txt
 
+# Transformers Requirements
+-r requirements_pretrained_embeddings_transformers.txt
+
 jieba==0.39
diff --git a/alt_requirements/requirements_pretrained_embeddings_transformers.txt b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
new file mode 100644
index 000000000000..a513258cbe48
--- /dev/null
+++ b/alt_requirements/requirements_pretrained_embeddings_transformers.txt
@@ -0,0 +1,4 @@
+# Minimum Install Requirements
+-r ../requirements.txt
+
+transformers==2.3.0
\ No newline at end of file
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 5a3386198d7c..e35cfe5860aa 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -103,6 +103,9 @@ HFTransformersNLP
     featurization to compute sequence and sentence level representations for each example in the training data.
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
+
+    .. note:: To use ``HFTransformersNLP`` component, install Rasa OS with ``pip install rasa[transformers]``.
+
 :Configuration:
     .. code-block:: yaml
 
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 70a577ccfec0..554e4b0b1160 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -9,14 +9,6 @@
 import rasa.utils.train_utils as train_utils
 import numpy as np
 
-from rasa.nlu.utils.hugging_face.registry import (
-    model_class_dict,
-    model_tokenizer_dict,
-    model_weights_defaults,
-    model_special_tokens_pre_processors,
-    model_embeddings_post_processors,
-    model_tokens_cleaners,
-)
 from rasa.nlu.constants import (
     TEXT,
     LANGUAGE_MODEL_DOCS,
@@ -60,6 +52,12 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
     def _load_model(self) -> None:
         """Try loading the model"""
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_class_dict,
+            model_weights_defaults,
+            model_tokenizer_dict,
+        )
+
         self.model_name = self.component_config["model_name"]
 
         if self.model_name not in model_class_dict:
@@ -106,6 +104,10 @@ def _add_lm_specific_special_tokens(
         self, token_ids: List[List[int]]
     ) -> List[List[int]]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_special_tokens_pre_processors,
+        )
+
         augmented_tokens = [
             model_special_tokens_pre_processors[self.model_name](example_token_ids)
             for example_token_ids in token_ids
@@ -113,12 +115,19 @@ def _add_lm_specific_special_tokens(
         return augmented_tokens
 
     def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
+
+        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners
+
         return model_tokens_cleaners[self.model_name](token_strings)
 
     def _post_process_sequence_embeddings(
         self, sequence_embeddings: np.ndarray
     ) -> Tuple[np.ndarray, np.ndarray]:
 
+        from rasa.nlu.utils.hugging_face.registry import (
+            model_embeddings_post_processors,
+        )
+
         sentence_embeddings = []
         post_processed_sequence_embeddings = []
 
diff --git a/requirements.txt b/requirements.txt
index 42947178522f..452957aea85f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,6 @@ absl-py>=0.8.0
 # setuptools comes from tensorboard requirement:
 # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33
 setuptools >= 41.0.0
-tensor2tensor==1.14.0
 apscheduler==3.6.0
 tqdm==4.31.0
 networkx==2.4.0
@@ -62,7 +61,4 @@ PyJWT==1.7.1
 python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
-tensorflow_hub==0.7.0
 tensorflow-addons==0.7.0
-tensorflow-probability==0.7.0
-transformers==2.3.0
diff --git a/setup.py b/setup.py
index f4684ea9eb61..ba95e87f8fe9 100644
--- a/setup.py
+++ b/setup.py
@@ -85,13 +85,14 @@
     "SQLAlchemy~=1.3.0",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
-    "transformers~=2.3.0",
+    "tensorflow-addons==0.7.0",
 ]
 
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
     "convert": ["tensorflow_text~=2.1.0rc0", "tensorflow_hub~=0.7.0"],
+    "transformers": ["transformers~=2.3.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],

From 5621809f3a8db9dd7489f0799a6b3310a051bc74 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 14:15:31 +0100
Subject: [PATCH 501/633] Update docs/nlu/components.rst

Co-Authored-By: Tanja <tabergma@gmail.com>
---
 docs/nlu/components.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index e35cfe5860aa..570ce017f3b1 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -104,7 +104,7 @@ HFTransformersNLP
     Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
     component for downstream NLU models.
 
-    .. note:: To use ``HFTransformersNLP`` component, install Rasa OS with ``pip install rasa[transformers]``.
+     .. note:: To use ``HFTransformersNLP`` component, install Rasa Open Source with ``pip install rasa[transformers]``.
 
 :Configuration:
     .. code-block:: yaml
@@ -1730,4 +1730,4 @@ DIETClassifier
             # BILOU_flag determines whether to use BILOU tagging or not.
             # More rigorous however requires more examples per entity
             # rule of thumb: use only if more than 100 egs. per entity
-            "BILOU_flag": True
\ No newline at end of file
+            "BILOU_flag": True

From 41266a002d4c7e949e38fa30673ab998c9c37319 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 14:59:50 +0100
Subject: [PATCH 502/633] renaming functions

---
 rasa/nlu/classifiers/diet_classifier.py       |  6 ++---
 rasa/nlu/tokenizers/lm_tokenizer.py           |  4 ++--
 .../nlu/utils/hugging_face/hf_transformers.py | 24 +++++++++++--------
 rasa/utils/tensorflow/model_data.py           | 12 ++++++----
 tests/utils/test_model_data.py                |  6 ++---
 5 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 44c3154640a7..bff2acdf0922 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -386,8 +386,8 @@ def check_input_dimension_consistency(self, model_data: RasaModelData):
         """Checks if text features and label features have the same dimensionality if
         hidden layers are shared."""
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
-            num_text_features = model_data.get_feature_dimension("text_features")
-            num_label_features = model_data.get_feature_dimension("label_features")
+            num_text_features = model_data.feature_dimension("text_features")
+            num_label_features = model_data.feature_dimension("label_features")
 
             if num_text_features != num_label_features:
                 raise ValueError(
@@ -541,7 +541,7 @@ def _create_model_data(
         model_data = RasaModelData(label_key=self.label_key)
         model_data.add_features("text_features", [X_sparse, X_dense])
         model_data.add_features("label_features", [Y_sparse, Y_dense])
-        if label_attribute and model_data.feature_not_exists("label_features"):
+        if label_attribute and model_data.does_feature_exist("label_features"):
             # no label features are present, get default features from _label_data
             model_data.add_features(
                 "label_features", self._use_default_label_features(label_ids)
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 4c5faeb89dfe..d6210776ce8c 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -16,8 +16,8 @@
 class LanguageModelTokenizer(Tokenizer):
     """Tokenizer using transformer based language models.
 
-        Uses the output of HFTransformersNLP component to set the tokens
-        for dense featurizable attributes of each message object.
+    Uses the output of HFTransformersNLP component to set the tokens
+    for dense featurizable attributes of each message object.
     """
 
     provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 70a577ccfec0..052d09389f92 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -33,10 +33,10 @@
 class HFTransformersNLP(Component):
     """Utility Component for interfacing between Transformers library.
 
-        The transformers(https://github.com/huggingface/transformers) library
-        is used to load pre-trained language models like BERT, GPT-2, etc.
-        The component also tokenizes and featurizes dense featurizable attributes of each
-        message.
+    The transformers(https://github.com/huggingface/transformers) library
+    is used to load pre-trained language models like BERT, GPT-2, etc.
+    The component also tokenizes and featurizes dense featurizable attributes of each
+    message.
     """
 
     provides = [
@@ -64,7 +64,8 @@ def _load_model(self) -> None:
 
         if self.model_name not in model_class_dict:
             raise KeyError(
-                f"'{self.model_name}' not a valid model name. Choose from {str(list(model_class_dict.keys()))}or create"
+                f"'{self.model_name}' not a valid model name. Choose from "
+                f"{str(list(model_class_dict.keys()))}or create"
                 f"a new class inheriting from this class to support your model."
             )
 
@@ -72,7 +73,8 @@ def _load_model(self) -> None:
 
         if not self.model_weights:
             logger.info(
-                f"Model weights not specified. Will choose default model weights: {model_weights_defaults[self.model_name]}"
+                f"Model weights not specified. Will choose default model weights: "
+                f"{model_weights_defaults[self.model_name]}"
             )
             self.model_weights = model_weights_defaults[self.model_name]
 
@@ -84,10 +86,12 @@ def _load_model(self) -> None:
             self.model_weights
         )
 
-        # Use a universal pad token since all transformer architectures do not have a consistent token.
-        # Instead of pad_token_id we use unk_token_id because pad_token_id is not set for all architectures.
-        # We can't add a new token as well since vocabulary resizing is not yet supported for TF classes.
-        # Also, this does not hurt the model predictions since we use an attention mask while feeding input.
+        # Use a universal pad token since all transformer architectures do not have a
+        # consistent token. Instead of pad_token_id we use unk_token_id because
+        # pad_token_id is not set for all architectures. We can't add a new token as
+        # well since vocabulary resizing is not yet supported for TF classes.
+        # Also, this does not hurt the model predictions since we use an attention mask
+        # while feeding input.
         self.pad_token_id = self.tokenizer.unk_token_id
 
     @classmethod
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 5a8cc16df705..2d55489a2c3c 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -25,7 +25,7 @@ def __init__(self, label_key: Optional[Text] = None, data: Data = None):
         self.data = data or {}
         self.label_key = label_key
         # will be updated when features are added
-        self.num_examples = self.get_number_of_examples()
+        self.num_examples = self.number_of_examples()
 
     def get(self, key: Text) -> List[np.ndarray]:
         if key in self.data:
@@ -42,13 +42,15 @@ def values(self):
     def keys(self):
         return self.data.keys()
 
-    def feature_not_exists(self, key: Text) -> bool:
+    def does_feature_exist(self, key: Text) -> bool:
+        """Check if feature key is present and features are available."""
         return key not in self.data or not self.data[key]
 
     def is_empty(self):
+        """Checks if data is set."""
         return not self.data
 
-    def get_number_of_examples(self) -> int:
+    def number_of_examples(self) -> int:
         """Obtain number of examples in data.
 
         Raise a ValueError if number of examples differ for different data in
@@ -68,7 +70,7 @@ def get_number_of_examples(self) -> int:
 
         return example_lengths[0]
 
-    def get_feature_dimension(self, key: Text) -> int:
+    def feature_dimension(self, key: Text) -> int:
         """Get the feature dimension of the given key."""
 
         number_of_features = 0
@@ -133,7 +135,7 @@ def add_features(self, key: Text, features: List[np.ndarray]):
             del self.data[key]
 
         # update number of examples
-        self.num_examples = self.get_number_of_examples()
+        self.num_examples = self.number_of_examples()
 
     def add_mask(self, key: Text, from_key: Text):
         """Calculate mask for given key and put it under specified key."""
diff --git a/tests/utils/test_model_data.py b/tests/utils/test_model_data.py
index 2d5a2d5b7d9f..c34643dec5b0 100644
--- a/tests/utils/test_model_data.py
+++ b/tests/utils/test_model_data.py
@@ -141,13 +141,13 @@ def test_session_data_for_ids(model_data: RasaModelData):
 
 
 def test_get_number_of_examples(model_data: RasaModelData):
-    assert model_data.get_number_of_examples() == 5
+    assert model_data.number_of_examples() == 5
 
 
 def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):
     model_data.data["dense"] = [np.random.randint(5, size=(2, 10))]
     with pytest.raises(ValueError):
-        model_data.get_number_of_examples()
+        model_data.number_of_examples()
 
 
 def test_gen_batch(model_data: RasaModelData):
@@ -184,6 +184,6 @@ def test_not_balance_model_data(model_data: RasaModelData):
 
 
 def test_get_num_of_features(model_data: RasaModelData):
-    num_features = model_data.get_feature_dimension("text_features")
+    num_features = model_data.feature_dimension("text_features")
 
     assert num_features == 24

From 3cfd2430754b3fb3916f91c62bc750ebb976658d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 15:21:39 +0100
Subject: [PATCH 503/633] droprate -> drop rate

---
 rasa/core/policies/embedding_policy.py        | 28 +++++++-------
 rasa/core/policies/ted_policy.py              | 38 +++++++++----------
 rasa/nlu/classifiers/diet_classifier.py       | 24 ++++++------
 .../embedding_intent_classifier.py            |  8 ++--
 rasa/nlu/selectors/diet_selector.py           | 12 +++---
 rasa/nlu/selectors/response_selector.py       |  8 ++--
 rasa/utils/tensorflow/constants.py            | 10 ++---
 rasa/utils/train_utils.py                     | 14 ++++---
 8 files changed, 72 insertions(+), 70 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 184725727745..95125ff035f8 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -22,16 +22,16 @@
     NUM_NEG,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    NEG_MARGIN_SCALE,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
     EMBEDDING_DIMENSION,
-    DROPRATE_DIALOGUE,
-    DROPRATE_LABEL,
-    DROPRATE_ATTENTION,
+    DROP_RATE_DIALOGUE,
+    DROP_RATE_LABEL,
+    DROP_RATE_ATTENTION,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -43,15 +43,15 @@
 
 
 class EmbeddingPolicy(TEDPolicy):
-    """Transformer Embedding Dialogue Policy (TEDP)
+    """Transformer Embedding Dialogue (TED) Policy.
 
-    The policy used in our paper https://arxiv.org/abs/1910.00486
+    The policy used in our paper https://arxiv.org/abs/1910.00486.
     """
 
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # a list of hidden layers sizes before dialogue and action embed layers
+        # a list of hidden layers sizes before dialogue and action embedding layers
         # number of hidden layers is equal to the length of this list
         HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
         # number of units in transformer
@@ -67,7 +67,7 @@ class EmbeddingPolicy(TEDPolicy):
         # batch size will be linearly increased for each epoch
         BATCH_SIZES: [8, 32],
         # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "balanced",  # 'sequence' or 'balanced'
         # number of epochs
         EPOCHS: 1,
         # set random seed to any int to get reproducible results
@@ -78,9 +78,9 @@ class EmbeddingPolicy(TEDPolicy):
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect labels
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
+        SIMILARITY_TYPE: "auto",  # 'auto' or 'cosine' or 'inner'
         # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        LOSS_TYPE: "softmax",  # 'softmax' or 'margin'
         # number of top actions to normalize scores for softmax loss_type
         # set to 0 to turn off normalization
         RANKING_LENGTH: 10,
@@ -99,13 +99,13 @@ class EmbeddingPolicy(TEDPolicy):
         REGULARIZATION_CONSTANT: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for dial nn
-        DROPRATE_DIALOGUE: 0.1,
+        DROP_RATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
-        DROPRATE_LABEL: 0.0,
+        DROP_RATE_LABEL: 0.0,
         # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
+        DROP_RATE_ATTENTION: 0,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index f0a2c13087af..4a84cd69f70a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -42,16 +42,16 @@
     NUM_NEG,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    NEG_MARGIN_SCALE,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_NEG_SIM,
     MAX_NEG_SIM,
     MAX_POS_SIM,
     EMBEDDING_DIMENSION,
-    DROPRATE_DIALOGUE,
-    DROPRATE_LABEL,
-    DROPRATE_ATTENTION,
+    DROP_RATE_DIALOGUE,
+    DROP_RATE_LABEL,
+    DROP_RATE_ATTENTION,
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
@@ -62,9 +62,9 @@
 
 
 class TEDPolicy(Policy):
-    """Transformer Embedding Dialogue Policy (TEDP)
+    """Transformer Embedding Dialogue (TED) Policy.
 
-    The policy used in our paper https://arxiv.org/abs/1910.00486
+    The policy used in our paper https://arxiv.org/abs/1910.00486.
     """
 
     SUPPORTS_ONLINE_TRAINING = True
@@ -72,7 +72,7 @@ class TEDPolicy(Policy):
     # default properties (DOC MARKER - don't remove)
     defaults = {
         # nn architecture
-        # a list of hidden layers sizes before dialogue and action embed layers
+        # a list of hidden layers sizes before dialogue and action embedding layers
         # number of hidden layers is equal to the length of this list
         HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
         # number of units in transformer
@@ -94,7 +94,7 @@ class TEDPolicy(Policy):
         # batch size will be linearly increased for each epoch
         BATCH_SIZES: [8, 32],
         # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
+        BATCH_STRATEGY: "balanced",  # 'sequence' or 'balanced'
         # number of epochs
         EPOCHS: 1,
         # set random seed to any int to get reproducible results
@@ -105,9 +105,9 @@ class TEDPolicy(Policy):
         # the type of the similarity
         NUM_NEG: 20,
         # flag if minimize only maximum similarity over incorrect labels
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
+        SIMILARITY_TYPE: "auto",  # 'auto' or 'cosine' or 'inner'
         # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
+        LOSS_TYPE: "softmax",  # 'softmax' or 'margin'
         # number of top actions to normalize scores for softmax loss_type
         # set to 0 to turn off normalization
         RANKING_LENGTH: 10,
@@ -126,13 +126,13 @@ class TEDPolicy(Policy):
         REGULARIZATION_CONSTANT: 0.001,
         # the scale of how important is to minimize the maximum similarity
         # between embeddings of different labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for dial nn
-        DROPRATE_DIALOGUE: 0.1,
+        DROP_RATE_DIALOGUE: 0.1,
         # dropout rate for bot nn
-        DROPRATE_LABEL: 0.0,
+        DROP_RATE_LABEL: 0.0,
         # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
+        DROP_RATE_ATTENTION: 0,
         # visualization of accuracy
         # how often calculate validation accuracy
         EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
@@ -511,20 +511,20 @@ def _prepare_layers(self) -> None:
             self.config[MAX_POS_SIM],
             self.config[MAX_NEG_SIM],
             self.config[USE_MAX_NEG_SIM],
-            self.config[NEG_MARGIN_SCALE],
+            self.config[NEGATIVE_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
         )
         self._tf_layers["ffnn.dialogue"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][DIALOGUE],
-            self.config[DROPRATE_DIALOGUE],
+            self.config[DROP_RATE_DIALOGUE],
             self.config[REGULARIZATION_CONSTANT],
             layer_name_suffix=DIALOGUE,
         )
         self._tf_layers["ffnn.label"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][LABEL],
-            self.config[DROPRATE_LABEL],
+            self.config[DROP_RATE_LABEL],
             self.config[REGULARIZATION_CONSTANT],
             layer_name_suffix=LABEL,
         )
@@ -535,8 +535,8 @@ def _prepare_layers(self) -> None:
             self.config[TRANSFORMER_SIZE] * 4,
             self.config[MAX_SEQUENCE_LENGTH],
             self.config[REGULARIZATION_CONSTANT],
-            dropout_rate=self.config[DROPRATE_DIALOGUE],
-            attention_dropout_rate=self.config[DROPRATE_ATTENTION],
+            dropout_rate=self.config[DROP_RATE_DIALOGUE],
+            attention_dropout_rate=self.config[DROP_RATE_ATTENTION],
             unidirectional=True,
             use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
             use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index bff2acdf0922..c1307667042d 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -59,9 +59,9 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
-    DROPRATE,
-    DROPRATE_ATTENTION,
-    NEG_MARGIN_SCALE,
+    DROP_RATE,
+    DROP_RATE_ATTENTION,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_NEG_SIM,
@@ -166,11 +166,11 @@ class DIETClassifier(EntityExtractor):
         REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for encoder
-        DROPRATE: 0.2,
+        DROP_RATE: 0.2,
         # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
+        DROP_RATE_ATTENTION: 0,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
         # visualization of accuracy
@@ -1051,7 +1051,7 @@ def _prepare_sparse_dense_layers(
 
     def _prepare_input_layers(self, name: Text) -> None:
         self._tf_layers[f"sparse_dropout.{name}"] = layers.SparseDropout(
-            rate=self.config[DROPRATE]
+            rate=self.config[DROP_RATE]
         )
         self._prepare_sparse_dense_layers(
             self.data_signature[f"{name}_features"],
@@ -1061,7 +1061,7 @@ def _prepare_input_layers(self, name: Text) -> None:
         )
         self._tf_layers[f"ffnn.{name}"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][name],
-            self.config[DROPRATE],
+            self.config[DROP_RATE],
             self.config[REGULARIZATION_CONSTANT],
             name,
         )
@@ -1077,8 +1077,8 @@ def _prepare_sequence_layers(self, name: Text) -> None:
                 self.config[TRANSFORMER_SIZE] * 4,
                 self.config[MAX_SEQUENCE_LENGTH],
                 self.config[REGULARIZATION_CONSTANT],
-                dropout_rate=self.config[DROPRATE],
-                attention_dropout_rate=self.config[DROPRATE_ATTENTION],
+                dropout_rate=self.config[DROP_RATE],
+                attention_dropout_rate=self.config[DROP_RATE_ATTENTION],
                 unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
                 use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
                 use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
@@ -1109,7 +1109,7 @@ def _prepare_mask_lm_layers(self, name: Text) -> None:
             self.config[MAX_POS_SIM],
             self.config[MAX_NEG_SIM],
             self.config[USE_MAX_NEG_SIM],
-            self.config[NEG_MARGIN_SCALE],
+            self.config[NEGATIVE_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
@@ -1134,7 +1134,7 @@ def _prepare_label_classification_layers(self) -> None:
             self.config[MAX_POS_SIM],
             self.config[MAX_NEG_SIM],
             self.config[USE_MAX_NEG_SIM],
-            self.config[NEG_MARGIN_SCALE],
+            self.config[NEGATIVE_MARGIN_SCALE],
             self.config[SCALE_LOSS],
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index aa674a366a73..7c9a427804c8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -26,8 +26,8 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    DROPRATE,
-    NEG_MARGIN_SCALE,
+    DROP_RATE,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_NEG_SIM,
@@ -97,9 +97,9 @@ class EmbeddingIntentClassifier(DIETClassifier):
         REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
-        DROPRATE: 0.2,
+        DROP_RATE: 0.2,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
diff --git a/rasa/nlu/selectors/diet_selector.py b/rasa/nlu/selectors/diet_selector.py
index 6340de362263..8239666e5f6a 100644
--- a/rasa/nlu/selectors/diet_selector.py
+++ b/rasa/nlu/selectors/diet_selector.py
@@ -33,9 +33,9 @@
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
     UNIDIRECTIONAL_ENCODER,
-    DROPRATE,
-    DROPRATE_ATTENTION,
-    NEG_MARGIN_SCALE,
+    DROP_RATE,
+    DROP_RATE_ATTENTION,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     EMBEDDING_DIMENSION,
@@ -147,11 +147,11 @@ class DIETSelector(DIETClassifier):
         REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different intent labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
-        DROPRATE: 0.2,
+        DROP_RATE: 0.2,
         # dropout rate for attention
-        DROPRATE_ATTENTION: 0,
+        DROP_RATE_ATTENTION: 0,
         # use a unidirectional or bidirectional encoder
         UNIDIRECTIONAL_ENCODER: False,
         # if true apply dropout to sparse tensors
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index a27e86e6cfa1..86fd50bbdc20 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -26,8 +26,8 @@
     INTENT_CLASSIFICATION,
     EVAL_NUM_EXAMPLES,
     EVAL_NUM_EPOCHS,
-    DROPRATE,
-    NEG_MARGIN_SCALE,
+    DROP_RATE,
+    NEGATIVE_MARGIN_SCALE,
     REGULARIZATION_CONSTANT,
     SCALE_LOSS,
     USE_MAX_NEG_SIM,
@@ -130,9 +130,9 @@ class ResponseSelector(DIETClassifier):
         REGULARIZATION_CONSTANT: 0.002,
         # the scale of how critical the algorithm should be of minimizing the
         # maximum similarity between embeddings of different intent labels
-        NEG_MARGIN_SCALE: 0.8,
+        NEGATIVE_MARGIN_SCALE: 0.8,
         # dropout rate for rnn
-        DROPRATE: 0.2,
+        DROP_RATE: 0.2,
         # if true apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
         # visualization of accuracy
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 539e36ad1c17..4d56a844f9aa 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -32,11 +32,11 @@
 
 SCALE_LOSS = "scale_loss"
 REGULARIZATION_CONSTANT = "regularization_constant"
-NEG_MARGIN_SCALE = "negative_margin_scale"
-DROPRATE = "droprate"
-DROPRATE_ATTENTION = "droprate_attention"
-DROPRATE_DIALOGUE = "droprate_dialogue"
-DROPRATE_LABEL = "droprate_label"
+NEGATIVE_MARGIN_SCALE = "negative_margin_scale"
+DROP_RATE = "drop_rate"
+DROP_RATE_ATTENTION = "drop_rate_attention"
+DROP_RATE_DIALOGUE = "drop_rate_dialogue"
+DROP_RATE_LABEL = "drop_rate_label"
 
 EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
 EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 3fcca407c405..d1af9eec2acd 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -25,9 +25,10 @@
     MAX_NEG_SIM,
     MAX_POS_SIM,
     EMBEDDING_DIMENSION,
-    DROPRATE_DIALOGUE,
-    DROPRATE_LABEL,
-    NEG_MARGIN_SCALE,
+    DROP_RATE_DIALOGUE,
+    DROP_RATE_LABEL,
+    NEGATIVE_MARGIN_SCALE,
+    DROP_RATE,
 )
 
 
@@ -143,8 +144,9 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
     config = _replace_deprecated_option(
         "hidden_layers_sizes_bot", [HIDDEN_LAYERS_SIZES, LABEL], config
     )
-    config = _replace_deprecated_option("droprate_a", DROPRATE_DIALOGUE, config)
-    config = _replace_deprecated_option("droprate_b", DROPRATE_LABEL, config)
+    config = _replace_deprecated_option("droprate", DROP_RATE, config)
+    config = _replace_deprecated_option("droprate_a", DROP_RATE_DIALOGUE, config)
+    config = _replace_deprecated_option("droprate_b", DROP_RATE_LABEL, config)
     config = _replace_deprecated_option(
         "hidden_layers_sizes_a", [HIDDEN_LAYERS_SIZES, TEXT], config
     )
@@ -163,7 +165,7 @@ def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
     config = _replace_deprecated_option("mu_neg", MAX_NEG_SIM, config)
     config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_NEG_SIM, config)
     config = _replace_deprecated_option("C2", REGULARIZATION_CONSTANT, config)
-    config = _replace_deprecated_option("C_emb", NEG_MARGIN_SCALE, config)
+    config = _replace_deprecated_option("C_emb", NEGATIVE_MARGIN_SCALE, config)
     config = _replace_deprecated_option(
         "evaluate_every_num_epochs", EVAL_NUM_EPOCHS, config
     )

From 6378522cd25dbea6068ffbefe260d86bf6929eb1 Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 15:32:20 +0100
Subject: [PATCH 504/633] bump tensorflow text to use latest versions

---
 alt_requirements/requirements_pretrained_embeddings_convert.txt | 2 +-
 setup.py                                                        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt
index 7a96d5bba9e3..a7bc4d785bf8 100644
--- a/alt_requirements/requirements_pretrained_embeddings_convert.txt
+++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt
@@ -1,5 +1,5 @@
 # Minimum Install Requirements
 -r ../requirements.txt
 
-tensorflow_text==2.1.0rc0
+tensorflow_text>=2.1.0rc0
 tensorflow_hub==0.7.0
diff --git a/setup.py b/setup.py
index f4684ea9eb61..ada697ebd49a 100644
--- a/setup.py
+++ b/setup.py
@@ -91,7 +91,7 @@
 extras_requires = {
     "test": tests_requires,
     "spacy": ["spacy>=2.1,<2.2"],
-    "convert": ["tensorflow_text~=2.1.0rc0", "tensorflow_hub~=0.7.0"],
+    "convert": ["tensorflow_text>=2.1.0rc0", "tensorflow_hub~=0.7.0"],
     "mitie": ["mitie"],
     "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"],
     "kafka": ["kafka-python~=1.4"],

From c8c3c3b380c779dcbed8e15a06e41af7ad32bd21 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 15:40:57 +0100
Subject: [PATCH 505/633] fixing persisting lexical syntactic featurizer

---
 .../sparse_featurizer/lexical_syntactic_featurizer.py  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index ca6cf920a406..aad615727f98 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -4,7 +4,6 @@
 
 import numpy as np
 import os
-import pickle
 import scipy.sparse
 from typing import Any, Dict, Optional, Text, List
 
@@ -15,6 +14,7 @@
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES
 from rasa.nlu.model import Metadata
+import rasa.utils.io as io_utils
 
 logger = logging.getLogger(__name__)
 
@@ -280,9 +280,9 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         """Persist this model into the passed directory.
         Return the metadata necessary to load the model again."""
 
-        with open(
-            os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "wb"
-        ) as f:
-            json.dump(self.feature_to_idx_dict, f)
+        feature_to_idx_file = os.path.join(
+            model_dir, file_name + ".feature_to_idx_dict.pkl"
+        )
+        io_utils.dump_obj_as_json_to_file(feature_to_idx_file, self.feature_to_idx_dict)
 
         return {"file": file_name}

From 14dfdf5e1762e87217ae7b593c698734ae24faf8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 16:27:22 +0100
Subject: [PATCH 506/633] improve docstrings of components

---
 docs/core/policies.rst                        |   2 +-
 rasa/core/policies/embedding_policy.py        | 125 ++++++++------
 rasa/core/policies/ted_policy.py              | 125 ++++++++------
 rasa/nlu/classifiers/diet_classifier.py       | 155 +++++++++---------
 .../embedding_intent_classifier.py            | 102 +++++++-----
 rasa/nlu/selectors/response_selector.py       |  97 +++++------
 6 files changed, 331 insertions(+), 275 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 60cade541abf..0f1b36cb9924 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -208,7 +208,7 @@ TED Policy
 ^^^^^^^^^^
 
 Transformer Embedding Dialogue (TED) Policy is described in
-`our paper <https://github.com/PolyAI-LDN/polyai-models#convert>`__.
+`our paper <https://arxiv.org/abs/1910.00486>`__.
 
 This policy has a pre-defined architecture, which comprises the
 following steps:
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 95125ff035f8..4b3fbfc621a5 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -43,82 +43,99 @@
 
 
 class EmbeddingPolicy(TEDPolicy):
-    """Transformer Embedding Dialogue (TED) Policy.
+    """Transformer Embedding Dialogue (TED) Policy is described in
+    https://arxiv.org/abs/1910.00486.
 
-    The policy used in our paper https://arxiv.org/abs/1910.00486.
+    This policy has a pre-defined architecture, which comprises the
+    following steps:
+        - concatenate user input (user intent and entities), previous system actions,
+          slots and active forms for each time step into an input vector to
+          pre-transformer embedding layer;
+        - feed it to transformer;
+        - apply a dense layer to the output of the transformer to get embeddings of a
+          dialogue for each time step;
+        - apply a dense layer to create embeddings for system actions for each time
+          step;
+        - calculate the similarity between the dialogue embedding and embedded system
+          actions. This step is based on the StarSpace
+          (https://arxiv.org/abs/1709.03856) idea.
     """
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
-        # nn architecture
-        # a list of hidden layers sizes before dialogue and action embedding layers
-        # number of hidden layers is equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the dialogue and label embedding layers.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
-        # number of units in transformer
+        # Number of units in transformer
         TRANSFORMER_SIZE: 128,
-        # number of transformer layers
+        # Number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # max sequence length if pos_encoding='emb'
+        # If 'True' use key relative embeddings in attention
+        KEY_RELATIVE_ATTENTION: False,
+        # If 'True' use key relative embeddings in attention
+        VALUE_RELATIVE_ATTENTION: False,
+        # Max position for relative embeddings
+        MAX_RELATIVE_POSITION: None,
+        # Max sequence length
         MAX_SEQUENCE_LENGTH: 256,
-        # number of attention heads in transformer
+        # Number of attention heads in transformer
         NUM_HEADS: 4,
-        # training parameters
-        # initial and final batch sizes:
-        # batch size will be linearly increased for each epoch
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [8, 32],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        BATCH_STRATEGY: "balanced",
+        # Number of epochs to train
         EPOCHS: 1,
-        # set random seed to any int to get reproducible results
+        # Set random seed to any int to get reproducible results
         RANDOM_SEED: None,
-        # embedding parameters
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # the type of the similarity
+        # Number of negative examples to compare to
         NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect labels
-        SIMILARITY_TYPE: "auto",  # 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # 'softmax' or 'margin'
-        # number of top actions to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        SIMILARITY_TYPE: "auto",
+        # The type of the loss function, either 'softmax' or 'margin'.
+        LOSS_TYPE: "softmax",
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect labels
-        MAX_NEG_SIM: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # the number of incorrect labels, the algorithm will minimize
-        # their similarity to the user input during training
-        USE_MAX_NEG_SIM: True,  # flag which loss function to use
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_POS_SIM: 0.8,
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_NEG_SIM: -0.2,
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
+        USE_MAX_NEG_SIM: True,
+        # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
-        # regularization
-        # the scale of regularization
+        # ## Regularization parameters
+        # The scale of regularization
         REGULARIZATION_CONSTANT: 0.001,
-        # the scale of how important is to minimize the maximum similarity
-        # between embeddings of different labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         NEGATIVE_MARGIN_SCALE: 0.8,
-        # dropout rate for dial nn
+        # Dropout rate for embedding layers of dialogue features.
         DROP_RATE_DIALOGUE: 0.1,
-        # dropout rate for bot nn
+        # Dropout rate for embedding layers of label, e.g. action, features.
         DROP_RATE_LABEL: 0.0,
-        # dropout rate for attention
+        # Dropout rate for attention.
         DROP_RATE_ATTENTION: 0,
-        # visualization of accuracy
-        # how often calculate validation accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for hold out validation set
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
-        # if true use key relative embeddings in attention
-        KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
-        VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
-        MAX_RELATIVE_POSITION: None,
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance.
+        EVAL_NUM_EPOCHS: 20,
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance.
+        EVAL_NUM_EXAMPLES: 0,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     def __init__(
         self,
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 4a84cd69f70a..8180cfd205ac 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -62,84 +62,101 @@
 
 
 class TEDPolicy(Policy):
-    """Transformer Embedding Dialogue (TED) Policy.
-
-    The policy used in our paper https://arxiv.org/abs/1910.00486.
+    """Transformer Embedding Dialogue (TED) Policy is described in
+    https://arxiv.org/abs/1910.00486.
+
+    This policy has a pre-defined architecture, which comprises the
+    following steps:
+        - concatenate user input (user intent and entities), previous system actions,
+          slots and active forms for each time step into an input vector to
+          pre-transformer embedding layer;
+        - feed it to transformer;
+        - apply a dense layer to the output of the transformer to get embeddings of a
+          dialogue for each time step;
+        - apply a dense layer to create embeddings for system actions for each time
+          step;
+        - calculate the similarity between the dialogue embedding and embedded system
+          actions. This step is based on the StarSpace
+          (https://arxiv.org/abs/1709.03856) idea.
     """
 
     SUPPORTS_ONLINE_TRAINING = True
 
-    # default properties (DOC MARKER - don't remove)
     defaults = {
-        # nn architecture
-        # a list of hidden layers sizes before dialogue and action embedding layers
-        # number of hidden layers is equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the dialogue and label embedding layers.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []},
-        # number of units in transformer
+        # Number of units in transformer
         TRANSFORMER_SIZE: 128,
-        # number of transformer layers
+        # Number of transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
-        # max sequence length
-        MAX_SEQUENCE_LENGTH: 256,
-        # number of attention heads in transformer
-        NUM_HEADS: 4,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
+        # Max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
-        # training parameters
-        # initial and final batch sizes:
-        # batch size will be linearly increased for each epoch
+        # Max sequence length
+        MAX_SEQUENCE_LENGTH: 256,
+        # Number of attention heads in transformer
+        NUM_HEADS: 4,
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [8, 32],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        BATCH_STRATEGY: "balanced",
+        # Number of epochs to train
         EPOCHS: 1,
-        # set random seed to any int to get reproducible results
+        # Set random seed to any int to get reproducible results
         RANDOM_SEED: None,
-        # embedding parameters
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # the type of the similarity
+        # Number of negative examples to compare to
         NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect labels
-        SIMILARITY_TYPE: "auto",  # 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # 'softmax' or 'margin'
-        # number of top actions to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        SIMILARITY_TYPE: "auto",
+        # The type of the loss function, either 'softmax' or 'margin'.
+        LOSS_TYPE: "softmax",
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect labels
-        MAX_NEG_SIM: -0.2,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # the number of incorrect labels, the algorithm will minimize
-        # their similarity to the user input during training
-        USE_MAX_NEG_SIM: True,  # flag which loss function to use
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_POS_SIM: 0.8,
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_NEG_SIM: -0.2,
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
+        USE_MAX_NEG_SIM: True,
+        # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
-        # regularization
-        # the scale of regularization
+        # ## Regularization parameters
+        # The scale of regularization
         REGULARIZATION_CONSTANT: 0.001,
-        # the scale of how important is to minimize the maximum similarity
-        # between embeddings of different labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         NEGATIVE_MARGIN_SCALE: 0.8,
-        # dropout rate for dial nn
+        # Dropout rate for embedding layers of dialogue features.
         DROP_RATE_DIALOGUE: 0.1,
-        # dropout rate for bot nn
+        # Dropout rate for embedding layers of label, e.g. action, features.
         DROP_RATE_LABEL: 0.0,
-        # dropout rate for attention
+        # Dropout rate for attention.
         DROP_RATE_ATTENTION: 0,
-        # visualization of accuracy
-        # how often calculate validation accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for hold out validation set
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance.
+        EVAL_NUM_EPOCHS: 20,
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance.
+        EVAL_NUM_EXAMPLES: 0,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     @staticmethod
     def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer:
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c1307667042d..e7b713503fa5 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -79,119 +79,120 @@
 
 
 class DIETClassifier(EntityExtractor):
-    """label classifier using supervised embeddings.
-
-    The embedding intent classifier embeds user inputs
-    and intent labels into the same space.
-    Supervised embeddings are trained by maximizing similarity between them.
-    It also provides rankings of the labels that did not "win".
-
-    The embedding intent classifier needs to be preceded by
-    a featurizer in the pipeline.
-    This featurizer creates the features used for the embeddings.
-    It is recommended to use ``CountVectorsFeaturizer`` that
-    can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``.
-
-    Based on the starspace idea from: https://arxiv.org/abs/1709.03856.
-    However, in this implementation the `mu` parameter is treated differently
-    and additional hidden layers are added together with dropout.
+    """DIET (Dual Intent and Entity Transformer) is a multi-task architecture for
+    intent classification and entity recognition.
+
+    The architecture is based on a transformer which is shared for both tasks.
+    A sequence of entity labels is predicted through a Conditional Random Field (CRF)
+    tagging layer on top of the transformer output sequence corresponding to the
+    input sequence of tokens. The transformer output for the ``__CLS__`` token and
+    intent labels are embedded into a single semantic vector space. We use the
+    dot-product loss to maximize the similarity with the target label and minimize
+    similarities with negative samples.
     """
 
     provides = ["intent", "intent_ranking", "entities"]
 
     requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
 
-    # default properties (DOC MARKER - don't remove)
+    # please make sure to update the docs when changing a default parameter
     defaults = {
-        # nn architecture
-        # sizes of hidden layers before the embedding layer
-        # for input words and intent labels,
-        # the number of hidden layers is thus equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the embedding layers for user message
+        # and labels.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []},
-        # Whether to share the hidden layer weights between input words and labels
+        # Whether to share the hidden layer weights between user message and labels.
         SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
+        # Number of units in transformer
         TRANSFORMER_SIZE: 256,
-        # number of transformer layers
+        # Number of transformer layers
         NUM_TRANSFORMER_LAYERS: 2,
-        # number of attention heads in transformer
+        # Number of attention heads in transformer
         NUM_HEADS: 4,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
+        # Max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
-        # max sequence length
+        # Max sequence length
         MAX_SEQUENCE_LENGTH: 256,
-        # use a unidirectional or bidirectional encoder
+        # Use a unidirectional or bidirectional encoder.
         UNIDIRECTIONAL_ENCODER: False,
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        BATCH_STRATEGY: "balanced",
+        # Number of epochs to train
         EPOCHS: 300,
-        # set random seed to any int to get reproducible results
+        # Set random seed to any int to get reproducible results
         RANDOM_SEED: None,
-        # optimizer
+        # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # the type of the similarity
+        # Default dense dimension to use if no dense features are present.
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
+        # Number of negative examples to compare to
         NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect actions
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
-        # number of top intents to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        SIMILARITY_TYPE: "auto",
+        # The type of the loss function, either 'softmax' or 'margin'.
+        LOSS_TYPE: "softmax",
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect labels
-        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # flag: if true, only minimize the maximum similarity for incorrect labels
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_POS_SIM: 0.8,
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_NEG_SIM: -0.4,
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         USE_MAX_NEG_SIM: True,
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
-        # regularization parameters
-        # the scale of L2 regularization
+        # ## Regularization parameters
+        # The scale of regularization
         REGULARIZATION_CONSTANT: 0.002,
-        # the scale of how critical the algorithm should be of minimizing the
-        # maximum similarity between embeddings of different labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         NEGATIVE_MARGIN_SCALE: 0.8,
-        # dropout rate for encoder
+        # Dropout rate for encoder
         DROP_RATE: 0.2,
-        # dropout rate for attention
+        # Dropout rate for attention
         DROP_RATE_ATTENTION: 0,
-        # if true apply dropout to sparse tensors
+        # If 'True' apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: True,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
-        # model config
-        # if true intent classification is trained and intent predicted
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance.
+        EVAL_NUM_EPOCHS: 20,
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance.
+        EVAL_NUM_EXAMPLES: 0,
+        # ## Model config
+        # If 'True' intent classification is trained and intent predicted.
         INTENT_CLASSIFICATION: True,
-        # if true named entity recognition is trained and entities predicted
+        # If 'True' named entity recognition is trained and entities predicted.
         ENTITY_RECOGNITION: True,
-        # if true random tokens of the input message will be masked and the model
-        # should predict those tokens
+        # If 'True' random tokens of the input message will be masked and the model
+        # should predict those tokens.
         MASKED_LM: False,
-        # BILOU_flag determines whether to use BILOU tagging or not.
-        # More rigorous however requires more examples per entity
-        # rule of thumb: use only if more than 100 egs. per entity
+        # 'BILOU_flag' determines whether to use BILOU tagging or not.
+        # If set to 'True' labelling is more rigorous, however more
+        # examples per entity are required.
+        # Rule of thumb: you should have more than 100 examples per entity.
         BILOU_FLAG: True,
     }
-    # end default properties (DOC MARKER - don't remove)
 
     # init helpers
     def _check_config_parameters(self) -> None:
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 7c9a427804c8..b2c06a6cdd49 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Any, Dict, Optional, Text
 
-from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.components import any_of
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES
@@ -43,6 +42,16 @@
 
 
 class EmbeddingIntentClassifier(DIETClassifier):
+    """Dual Intent Entity Transformer used for intent classification.
+
+    The ``EmbeddingIntentClassifier`` embeds user inputs and intent labels into the
+    same space.
+    Supervised embeddings are trained by maximizing similarity between them.
+    This algorithm is based on `StarSpace <https://arxiv.org/abs/1709.03856>`_.
+    However, in this implementation the loss function is slightly different and
+    additional hidden layers are added together with dropout.
+    This algorithm also provides similarity rankings of the labels that did not "win".
+    """
 
     provides = ["intent", "intent_ranking"]
 
@@ -50,63 +59,70 @@ class EmbeddingIntentClassifier(DIETClassifier):
 
     # please make sure to update the docs when changing a default parameter
     defaults = {
-        # nn architecture
-        # sizes of hidden layers before the embedding layer
-        # for input words and intent labels,
-        # the number of hidden layers is thus equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the embedding layers for user message
+        # and labels.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: []},
-        # Whether to share the hidden layer weights between input words and labels
+        # Whether to share the hidden layer weights between user message and labels.
         SHARE_HIDDEN_LAYERS: False,
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        BATCH_STRATEGY: "balanced",
+        # Number of epochs to train
         EPOCHS: 300,
-        # set random seed to any int to get reproducible results
+        # Set random seed to any int to get reproducible results
         RANDOM_SEED: None,
-        # optimizer
+        # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # the type of the similarity
+        # Default dense dimension to use if no dense features are present.
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
+        # Number of negative examples to compare to
         NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect actions
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
-        # number of top intents to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        SIMILARITY_TYPE: "auto",
+        # The type of the loss function, either 'softmax' or 'margin'.
+        LOSS_TYPE: "softmax",
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect labels
-        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # flag: if true, only minimize the maximum similarity for incorrect labels
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_POS_SIM: 0.8,
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_NEG_SIM: -0.4,
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         USE_MAX_NEG_SIM: True,
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
-        # regularization parameters
-        # the scale of regularization
+        # ## Regularization parameters
+        # The scale of regularization
         REGULARIZATION_CONSTANT: 0.002,
-        # the scale of how critical the algorithm should be of minimizing the
-        # maximum similarity between embeddings of different labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         NEGATIVE_MARGIN_SCALE: 0.8,
-        # dropout rate for rnn
+        # Dropout rate for encoder
         DROP_RATE: 0.2,
-        # if true apply dropout to sparse tensors
+        # If 'True' apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance.
+        EVAL_NUM_EPOCHS: 20,
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance.
+        EVAL_NUM_EXAMPLES: 0,
     }
 
     def __init__(
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 86fd50bbdc20..099204c6d2a3 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -81,67 +81,72 @@ class ResponseSelector(DIETClassifier):
 
     # please make sure to update the docs when changing a default parameter
     defaults = {
-        # nn architecture
-        # sizes of hidden layers before the embedding layer
-        # for input words and responses
-        # the number of hidden layers is thus equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the embedding layers for user message
+        # and labels.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
-        # Whether to share the hidden layer weights between input words and intent
-        # labels
+        # Whether to share the hidden layer weights between user message and labels.
         SHARE_HIDDEN_LAYERS: False,
-        # training parameters
-        # initial and final batch sizes - batch size will be
-        # linearly increased for each epoch
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [64, 256],
-        # how to create batches
-        BATCH_STRATEGY: "balanced",  # string 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        BATCH_STRATEGY: "balanced",
+        # Number of epochs to train
         EPOCHS: 300,
-        # set random seed to any int to get reproducible results
+        # Set random seed to any int to get reproducible results
         RANDOM_SEED: None,
-        # optimizer
+        # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
-        # embedding parameters
-        # default dense dimension used if no dense features are present
-        DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # the type of the similarity
+        # Default dense dimension to use if no dense features are present.
+        DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
+        # Number of negative examples to compare to
         NUM_NEG: 20,
-        # flag if minimize only maximum similarity over incorrect actions
-        SIMILARITY_TYPE: "auto",  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        LOSS_TYPE: "softmax",  # string 'softmax' or 'margin'
-        # number of top responses to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        SIMILARITY_TYPE: "auto",
+        # The type of the loss function, either 'softmax' or 'margin'.
+        LOSS_TYPE: "softmax",
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
-        # how similar the algorithm should try
-        # to make embedding vectors for correct intent labels
-        MAX_POS_SIM: 0.8,  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect intent labels
-        MAX_NEG_SIM: -0.4,  # should be -1.0 < ... < 1.0 for 'cosine'
-        # flag: if true, only minimize the maximum similarity for
-        # incorrect intent labels
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_POS_SIM: 0.8,
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        MAX_NEG_SIM: -0.4,
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         USE_MAX_NEG_SIM: True,
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
-        # regularization parameters
-        # the scale of L2 regularization
+        # ## Regularization parameters
+        # The scale of regularization
         REGULARIZATION_CONSTANT: 0.002,
-        # the scale of how critical the algorithm should be of minimizing the
-        # maximum similarity between embeddings of different intent labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         NEGATIVE_MARGIN_SCALE: 0.8,
-        # dropout rate for rnn
+        # Dropout rate for encoder
         DROP_RATE: 0.2,
-        # if true apply dropout to sparse tensors
+        # If 'True' apply dropout to sparse tensors
         SPARSE_INPUT_DROPOUT: False,
-        # visualization of accuracy
-        # how often to calculate training accuracy
-        EVAL_NUM_EPOCHS: 20,  # small values may hurt performance
-        # how many examples to use for calculation of training accuracy
-        EVAL_NUM_EXAMPLES: 0,  # large values may hurt performance,
-        # selector config
-        # name of the intent for which this response selector is to be trained
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance.
+        EVAL_NUM_EPOCHS: 20,
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance.
+        EVAL_NUM_EXAMPLES: 0,
+        # ## Selector config
+        # Name of the intent for which this response selector is to be trained
         RETRIEVAL_INTENT: None,
     }
 

From 45a11a1dd0ac49a1bdeca0acba986ffcf2295436 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 17:01:52 +0100
Subject: [PATCH 507/633] update docs on model options

---
 docs/core/policies.rst                        | 113 +++---
 docs/nlu/components.rst                       | 335 ++++++++++--------
 rasa/core/policies/embedding_policy.py        |  14 +-
 rasa/core/policies/ted_policy.py              |  16 +-
 rasa/nlu/classifiers/diet_classifier.py       |  13 +-
 .../embedding_intent_classifier.py            |  13 +-
 rasa/nlu/selectors/response_selector.py       |  13 +-
 7 files changed, 278 insertions(+), 239 deletions(-)

diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index 0f1b36cb9924..a9073b34ee5e 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -351,73 +351,80 @@ It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(
 
     .. code-block:: yaml
 
-        # Architecture of the used neural network.
-        # a list of hidden layers sizes before dialogue and action embed layers
-        # number of hidden layers is equal to the length of this list
+        # ## Architecture of the used neural network
+        # Hidden layer sizes for layers before the dialogue and label embedding layers.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
         "hidden_layers_sizes": {"dialogue": [], "label": []}
-        # number of units in transformer
+        # Number of units in transformer
         "transformer_size": 128
-        # number of transformer layers
+        # Number of transformer layers
         "number_of_transformer_layers": 1
-        # max sequence length
-        "maximum_sequence_length": 256
-        # number of attention heads in transformer
-        "number_of_attention_heads": 4
-        # if true use key relative embeddings in attention
-        "use_key_relative_attention": False
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
+        "use_key_relative_attention": False,
+        # If 'True' use key relative embeddings in attention
         "use_value_relative_attention": False
-        # max position for relative embeddings
+        # Max position for relative embeddings
         "max_relative_position": None
-        # training parameters
-        # initial and final batch sizes:
-        # batch size will be linearly increased for each epoch
+        # Max sequence length
+        "maximum_sequence_length": 256
+        # Number of attention heads in transformer
+        "number_of_attention_heads": 4
+        # ## Training parameters
+        # Initial and final batch sizes:
+        # Batch size will be linearly increased for each epoch.
         "batch_size": [8, 32]
-        # how to create batches
-        "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-        # number of epochs
+        # Strategy used when creating batches.
+        # Can be either 'sequence' or 'balanced'.
+        "batch_strategy": "balanced"
+        # Number of epochs to train
         "epochs": 1
-        # set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         "random_seed": None
-        # embedding parameters
-        # dimension size of embedding vectors
+        # ## Parameters for embeddings
+        # Dimension size of embedding vectors
         "embedding_dimension": 20
-        # the type of the similarity
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         "number_of_negative_examples": 20
-        # flag if minimize only maximum similarity over incorrect labels
-        "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-        # the type of the loss function
-        "loss_type": "softmax"  # string 'softmax' or 'margin'
-        # number of top actions to normalize scores for softmax loss_type
-        # set to 0 to turn off normalization
+        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+        "similarity_type": "auto"
+        # The type of the loss function, either 'softmax' or 'margin'.
+        "loss_type": "softmax"
+        # Number of top actions to normalize scores for loss type 'softmax'.
+        # Set to 0 to turn off normalization.
         "ranking_length": 10
-        # how similar the algorithm should try
-        # to make embedding vectors for correct labels
-        "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-        # maximum negative similarity for incorrect labels
-        "maximum_negative_similarity": -0.2  # should be -1.0 < ... < 1.0 for 'cosine'
-        # the number of incorrect labels, the algorithm will minimize
-        # their similarity to the user input during training
-        "use_maximum_negative_similarity": True  # flag which loss function to use
-        # scale loss inverse proportionally to confidence of correct prediction
+        # Indicates how similar the algorithm should try to make embedding vectors
+        # for correct labels.
+        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+        "maximum_positive_similarity": 0.8
+        # Maximum negative similarity for incorrect labels.
+        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+        "maximum_negative_similarity": -0.2
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
+        "use_maximum_negative_similarity": True
+        # Scale loss inverse proportionally to confidence of correct prediction
         "scale_loss": True
-        # regularization
-        # the scale of regularization
+        # ## Regularization parameters
+        # The scale of regularization
         "regularization_constant": 0.001
-        # the scale of how important is to minimize the maximum similarity
-        # between embeddings of different labels
+        # The scale of how important is to minimize the maximum similarity
+        # between embeddings of different labels.
         "negative_margin_scale": 0.8
-        # dropout rate for dial nn
-        "droprate_dialogue": 0.1
-        # dropout rate for bot nn
-        "droprate_label": 0.0
-        # dropout rate for attention
-        "droprate_attention": 0
-        # visualization of accuracy
-        # how often calculate validation accuracy
-        "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-        # how many examples to use for hold out validation set
-        "evaluate_on_number_of_examples": 0  # large values may hurt performance
+        # Dropout rate for embedding layers of dialogue features.
+        "drop_rate_dialogue": 0.1
+        # Dropout rate for embedding layers of label, e.g. action, features.
+        "drop_rate_label": 0.0
+        # Dropout rate for attention.
+        "drop_rate_attention": 0
+        # ## Evaluation parameters
+        # How often calculate validation accuracy.
+        # Small values may hurt performance, e.g. model accuracy.
+        "evaluate_every_number_of_epochs": 20
+        # How many examples to use for hold out validation set
+        # Large values may hurt performance, e.g. model accuracy.
+        "evaluate_on_number_of_examples": 0
 
     .. note::
 
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 5a3386198d7c..debe2988ad84 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -819,7 +819,7 @@ EmbeddingIntentClassifier
             - ``regularization_constant`` sets the scale of L2 regularization.
             - ``negative_margin_scale`` sets the scale of how important is to minimize
               the maximum similarity between embeddings of different intent labels.
-            - ``droprate`` sets the dropout rate, it should be
+            - ``drop_rate`` sets the dropout rate, it should be
               between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
@@ -842,63 +842,71 @@ EmbeddingIntentClassifier
 
         pipeline:
         - name: "EmbeddingIntentClassifier"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
+            # ## Architecture of the used neural network
+            # Hidden layer sizes for layers before the embedding layers for user message
+            # and labels.
+            # The number of hidden layers is equal to the length of the corresponding
+            # list.
             "hidden_layers_sizes": {"text": [256, 128], "label": []}
-            # Whether to share the hidden layer weights between input words and labels
+            # Whether to share the hidden layer weights between user message and labels.
             "share_hidden_layers": False
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
+            # ## Training parameters
+            # Initial and final batch sizes:
+            # Batch size will be linearly increased for each epoch.
             "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
+            # Strategy used when creating batches.
+            # Can be either 'sequence' or 'balanced'.
+            "batch_strategy": "balanced"
+            # Number of epochs to train
             "epochs": 300
-            # set random seed to any int to get reproducible results
+            # Set random seed to any 'int' to get reproducible results
             "random_seed": None
-            # optimizer
+            # Initial learning rate for the optimizer
             "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 20}
-            # dimension size of embedding vectors
+            # ## Parameters for embeddings
+            # Dimension size of embedding vectors
             "embedding_dimension": 20
-            # the type of the similarity
+            # Default dense dimension to use if no dense features are present.
+            "dense_dimension": {"text": 512, "label": 20}
+            # The number of incorrect labels. The algorithm will minimize
+            # their similarity to the user input during training.
             "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
+            # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+            "similarity_type": "auto"
+            # The type of the loss function, either 'softmax' or 'margin'.
+            "loss_type": "softmax"
+            # Number of top actions to normalize scores for loss type 'softmax'.
+            # Set to 0 to turn off normalization.
             "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
+            # Indicates how similar the algorithm should try to make embedding vectors
+            # for correct labels.
+            # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_positive_similarity": 0.8
+            # Maximum negative similarity for incorrect labels.
+            # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_negative_similarity": -0.4
+            # If 'True' the algorithm only minimizes maximum similarity over
+            # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
             "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
+            # Scale loss inverse proportionally to confidence of correct prediction
             "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
+            # ## Regularization parameters
+            # The scale of regularization
             "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
+            # The scale of how important is to minimize the maximum similarity
+            # between embeddings of different labels.
             "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # if true apply dropout to sparse tensors
+            # Dropout rate for encoder
+            "drop_rate": 0.2
+            # If 'True' apply dropout to sparse tensors
             "use_sparse_input_dropout": False
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
+            # ## Evaluation parameters
+            # How often calculate validation accuracy.
+            # Small values may hurt performance, e.g. model accuracy.
+            "evaluate_every_number_of_epochs": 20
+            # How many examples to use for hold out validation set
+            # Large values may hurt performance, e.g. model accuracy.
+            "evaluate_on_number_of_examples": 0
 
 .. _keyword_intent_classifier:
 
@@ -992,65 +1000,73 @@ ResponseSelector
 
         pipeline:
         - name: "ResponseSelector"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
-            # Whether to share the hidden layer weights between input words and labels
+            # ## Architecture of the used neural network
+            # Hidden layer sizes for layers before the embedding layers for user message
+            # and labels.
+            # The number of hidden layers is equal to the length of the corresponding
+            # list.
+            "hidden_layers_sizes": {"text": [256, 128], "label": [256, 128]}
+            # Whether to share the hidden layer weights between user message and labels.
             "share_hidden_layers": False
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
+            # ## Training parameters
+            # Initial and final batch sizes:
+            # Batch size will be linearly increased for each epoch.
             "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
+            # Strategy used when creating batches.
+            # Can be either 'sequence' or 'balanced'.
+            "batch_strategy": "balanced"
+            # Number of epochs to train
             "epochs": 300
-            # set random seed to any int to get reproducible results
+            # Set random seed to any 'int' to get reproducible results
             "random_seed": None
-            # optimizer
+            # Initial learning rate for the optimizer
             "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 512}
-            # dimension size of embedding vectors
+            # ## Parameters for embeddings
+            # Dimension size of embedding vectors
             "embedding_dimension": 20
-            # the type of the similarity
+            # Default dense dimension to use if no dense features are present.
+            "dense_dimension": {"text": 512, "label": 512}
+            # The number of incorrect labels. The algorithm will minimize
+            # their similarity to the user input during training.
             "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
+            # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+            "similarity_type": "auto"
+            # The type of the loss function, either 'softmax' or 'margin'.
+            "loss_type": "softmax"
+            # Number of top actions to normalize scores for loss type 'softmax'.
+            # Set to 0 to turn off normalization.
             "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
+            # Indicates how similar the algorithm should try to make embedding vectors
+            # for correct labels.
+            # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_positive_similarity": 0.8
+            # Maximum negative similarity for incorrect labels.
+            # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_negative_similarity": -0.4
+            # If 'True' the algorithm only minimizes maximum similarity over
+            # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
             "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
+            # Scale loss inverse proportionally to confidence of correct prediction
             "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
+            # ## Regularization parameters
+            # The scale of regularization
             "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
+            # The scale of how important is to minimize the maximum similarity
+            # between embeddings of different labels.
             "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # if true apply dropout to sparse tensors
-            "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # selector config
-            # name of the intent for which this response selector is to be trained
+            # Dropout rate for encoder
+            "drop_rate": 0.2
+            # If 'True' apply dropout to sparse tensors
+            "use_sparse_input_dropout": False
+            # ## Evaluation parameters
+            # How often calculate validation accuracy.
+            # Small values may hurt performance, e.g. model accuracy.
+            "evaluate_every_number_of_epochs": 20
+            # How many examples to use for hold out validation set
+            # Large values may hurt performance, e.g. model accuracy.
+            "evaluate_on_number_of_examples": 0
+            # ## Selector config
+            # Name of the intent for which this response selector is to be trained
             "retrieval_intent": None
 
 
@@ -1641,90 +1657,99 @@ DIETClassifier
 
         pipeline:
         - name: "DIETClassifier"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [], "label": []}
-            # Whether to share the hidden layer weights between input words and labels
+            # ## Architecture of the used neural network
+            # Hidden layer sizes for layers before the embedding layers for user message
+            # and labels.
+            # The number of hidden layers is equal to the length of the corresponding
+            # list.
+            "hidden_layers_sizes": {TEXT: [], LABEL: []}
+            # Whether to share the hidden layer weights between user message and labels.
             "share_hidden_layers": False
-            # number of units in transformer
+            # Number of units in transformer
             "transformer_size": 256
-            # number of transformer layers
+            # Number of transformer layers
             "number_of_transformer_layers": 2
-            # number of attention heads in transformer
+            # Number of attention heads in transformer
             "number_of_attention_heads": 4
-            # max sequence length
-            "maximum_sequence_length": 256
-            # use a unidirectional or bidirectional encoder
-            "unidirectional_encoder": False
-            # if true use key relative embeddings in attention
+            # If 'True' use key relative embeddings in attention
             "use_key_relative_attention": False
-            # if true use key relative embeddings in attention
+            # If 'True' use key relative embeddings in attention
             "use_value_relative_attention": False
-            # max position for relative embeddings
+            # Max position for relative embeddings
             "max_relative_position": None
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
+            # Max sequence length
+            "maximum_sequence_length": 256
+            # Use a unidirectional or bidirectional encoder.
+            "unidirectional_encoder": False
+            # ## Training parameters
+            # Initial and final batch sizes:
+            # Batch size will be linearly increased for each epoch.
             "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
+            # Strategy used when creating batches.
+            # Can be either 'sequence' or 'balanced'.
+            "batch_strategy": "balanced"
+            # Number of epochs to train
             "epochs": 300
-            # set random seed to any int to get reproducible results
+            # Set random seed to any 'int' to get reproducible results
             "random_seed": None
-            # optimizer
+            # Initial learning rate for the optimizer
             "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 20}
-            # dimension size of embedding vectors
+            # ## Parameters for embeddings
+            # Dimension size of embedding vectors
             "embedding_dimension": 20
-            # the type of the similarity
+            # Default dense dimension to use if no dense features are present.
+            "dense_dimension": {TEXT: 512, LABEL: 20}
+            # The number of incorrect labels. The algorithm will minimize
+            # their similarity to the user input during training.
             "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
+            # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+            "similarity_type": "auto"
+            # The type of the loss function, either 'softmax' or 'margin'.
+            "loss_type": "softmax"
+            # Number of top actions to normalize scores for loss type 'softmax'.
+            # Set to 0 to turn off normalization.
             "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
+            # Indicates how similar the algorithm should try to make embedding vectors
+            # for correct labels.
+            # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_positive_similarity": 0.8
+            # Maximum negative similarity for incorrect labels.
+            # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_negative_similarity": -0.4
+            # If 'True' the algorithm only minimizes maximum similarity over
+            # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
             "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
+            # Scale loss inverse proportionally to confidence of correct prediction
             "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
+            # ## Regularization parameters
+            # The scale of regularization
             "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
+            # The scale of how important is to minimize the maximum similarity
+            # between embeddings of different labels.
             "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # dropout rate for attention
-            "droprate_attention": 0
-            # if true apply dropout to sparse tensors
+            # Dropout rate for encoder
+            "drop_rate": 0.2
+            # Dropout rate for attention
+            "drop_rate_attention": 0
+            # If 'True' apply dropout to sparse tensors
             "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # model config
-            # if true intent classification is trained and intent predicted
+            # ## Evaluation parameters
+            # How often calculate validation accuracy.
+            # Small values may hurt performance, e.g. model accuracy.
+            "evaluate_every_number_of_epochs": 20
+            # How many examples to use for hold out validation set
+            # Large values may hurt performance, e.g. model accuracy.
+            "evaluate_on_number_of_examples": 0
+            # ## Model config
+            # If 'True' intent classification is trained and intent predicted.
             "intent_classification": True
-            # if true named entity recognition is trained and entities predicted
+            # If 'True' named entity recognition is trained and entities predicted.
             "entity_recognition": True
-            # if true random tokens of the input message will be masked and the model
-            # should predict those tokens
+            # If 'True' random tokens of the input message will be masked and the model
+            # should predict those tokens.
             "use_masked_language_model": False
-            # BILOU_flag determines whether to use BILOU tagging or not.
-            # More rigorous however requires more examples per entity
-            # rule of thumb: use only if more than 100 egs. per entity
+            # 'BILOU_flag' determines whether to use BILOU tagging or not.
+            # If set to 'True' labelling is more rigorous, however more
+            # examples per entity are required.
+            # Rule of thumb: you should have more than 100 examples per entity.
             "BILOU_flag": True
\ No newline at end of file
diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 4b3fbfc621a5..b628053e9d08 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -61,6 +61,7 @@ class EmbeddingPolicy(TEDPolicy):
           (https://arxiv.org/abs/1709.03856) idea.
     """
 
+    # please make sure to update the docs when changing a default parameter
     defaults = {
         # ## Architecture of the used neural network
         # Hidden layer sizes for layers before the dialogue and label embedding layers.
@@ -90,12 +91,13 @@ class EmbeddingPolicy(TEDPolicy):
         BATCH_STRATEGY: "balanced",
         # Number of epochs to train
         EPOCHS: 1,
-        # Set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         RANDOM_SEED: None,
         # ## Parameters for embeddings
         # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # Number of negative examples to compare to
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
         SIMILARITY_TYPE: "auto",
@@ -111,8 +113,8 @@ class EmbeddingPolicy(TEDPolicy):
         # Maximum negative similarity for incorrect labels.
         # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
         MAX_NEG_SIM: -0.2,
-        # The number of incorrect labels. The algorithm will minimize
-        # their similarity to the user input during training.
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
         USE_MAX_NEG_SIM: True,
         # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
@@ -130,10 +132,10 @@ class EmbeddingPolicy(TEDPolicy):
         DROP_RATE_ATTENTION: 0,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
-        # Small values may hurt performance.
+        # Small values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EPOCHS: 20,
         # How many examples to use for hold out validation set
-        # Large values may hurt performance.
+        # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
     }
 
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 8180cfd205ac..96f663986053 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -82,6 +82,7 @@ class TEDPolicy(Policy):
 
     SUPPORTS_ONLINE_TRAINING = True
 
+    # please make sure to update the docs when changing a default parameter
     defaults = {
         # ## Architecture of the used neural network
         # Hidden layer sizes for layers before the dialogue and label embedding layers.
@@ -106,17 +107,18 @@ class TEDPolicy(Policy):
         # Initial and final batch sizes:
         # Batch size will be linearly increased for each epoch.
         BATCH_SIZES: [8, 32],
-        # Strategy used when creating batches.
+        # Strategy used whenc creating batches.
         # Can be either 'sequence' or 'balanced'.
         BATCH_STRATEGY: "balanced",
         # Number of epochs to train
         EPOCHS: 1,
-        # Set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         RANDOM_SEED: None,
         # ## Parameters for embeddings
         # Dimension size of embedding vectors
         EMBEDDING_DIMENSION: 20,
-        # Number of negative examples to compare to
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
         SIMILARITY_TYPE: "auto",
@@ -132,8 +134,8 @@ class TEDPolicy(Policy):
         # Maximum negative similarity for incorrect labels.
         # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
         MAX_NEG_SIM: -0.2,
-        # The number of incorrect labels. The algorithm will minimize
-        # their similarity to the user input during training.
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
         USE_MAX_NEG_SIM: True,
         # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
@@ -151,10 +153,10 @@ class TEDPolicy(Policy):
         DROP_RATE_ATTENTION: 0,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
-        # Small values may hurt performance.
+        # Small values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EPOCHS: 20,
         # How many examples to use for hold out validation set
-        # Large values may hurt performance.
+        # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
     }
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index e7b713503fa5..94c7b2fa09f1 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -130,7 +130,7 @@ class DIETClassifier(EntityExtractor):
         BATCH_STRATEGY: "balanced",
         # Number of epochs to train
         EPOCHS: 300,
-        # Set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         RANDOM_SEED: None,
         # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
@@ -139,7 +139,8 @@ class DIETClassifier(EntityExtractor):
         EMBEDDING_DIMENSION: 20,
         # Default dense dimension to use if no dense features are present.
         DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
-        # Number of negative examples to compare to
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
         SIMILARITY_TYPE: "auto",
@@ -155,8 +156,8 @@ class DIETClassifier(EntityExtractor):
         # Maximum negative similarity for incorrect labels.
         # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
         MAX_NEG_SIM: -0.4,
-        # The number of incorrect labels. The algorithm will minimize
-        # their similarity to the user input during training.
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
         USE_MAX_NEG_SIM: True,
         # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
@@ -174,10 +175,10 @@ class DIETClassifier(EntityExtractor):
         SPARSE_INPUT_DROPOUT: True,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
-        # Small values may hurt performance.
+        # Small values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EPOCHS: 20,
         # How many examples to use for hold out validation set
-        # Large values may hurt performance.
+        # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
         # ## Model config
         # If 'True' intent classification is trained and intent predicted.
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b2c06a6cdd49..b8bc881b763a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -76,7 +76,7 @@ class EmbeddingIntentClassifier(DIETClassifier):
         BATCH_STRATEGY: "balanced",
         # Number of epochs to train
         EPOCHS: 300,
-        # Set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         RANDOM_SEED: None,
         # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
@@ -85,7 +85,8 @@ class EmbeddingIntentClassifier(DIETClassifier):
         EMBEDDING_DIMENSION: 20,
         # Default dense dimension to use if no dense features are present.
         DENSE_DIMENSION: {TEXT: 512, LABEL: 20},
-        # Number of negative examples to compare to
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
         SIMILARITY_TYPE: "auto",
@@ -101,8 +102,8 @@ class EmbeddingIntentClassifier(DIETClassifier):
         # Maximum negative similarity for incorrect labels.
         # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
         MAX_NEG_SIM: -0.4,
-        # The number of incorrect labels. The algorithm will minimize
-        # their similarity to the user input during training.
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
         USE_MAX_NEG_SIM: True,
         # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
@@ -118,10 +119,10 @@ class EmbeddingIntentClassifier(DIETClassifier):
         SPARSE_INPUT_DROPOUT: False,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
-        # Small values may hurt performance.
+        # Small values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EPOCHS: 20,
         # How many examples to use for hold out validation set
-        # Large values may hurt performance.
+        # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
     }
 
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 099204c6d2a3..a3d400c314cb 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -98,7 +98,7 @@ class ResponseSelector(DIETClassifier):
         BATCH_STRATEGY: "balanced",
         # Number of epochs to train
         EPOCHS: 300,
-        # Set random seed to any int to get reproducible results
+        # Set random seed to any 'int' to get reproducible results
         RANDOM_SEED: None,
         # Initial learning rate for the optimizer
         LEARNING_RATE: 0.001,
@@ -107,7 +107,8 @@ class ResponseSelector(DIETClassifier):
         EMBEDDING_DIMENSION: 20,
         # Default dense dimension to use if no dense features are present.
         DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
-        # Number of negative examples to compare to
+        # The number of incorrect labels. The algorithm will minimize
+        # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
         SIMILARITY_TYPE: "auto",
@@ -123,8 +124,8 @@ class ResponseSelector(DIETClassifier):
         # Maximum negative similarity for incorrect labels.
         # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
         MAX_NEG_SIM: -0.4,
-        # The number of incorrect labels. The algorithm will minimize
-        # their similarity to the user input during training.
+        # If 'True' the algorithm only minimizes maximum similarity over
+        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
         USE_MAX_NEG_SIM: True,
         # Scale loss inverse proportionally to confidence of correct prediction
         SCALE_LOSS: True,
@@ -140,10 +141,10 @@ class ResponseSelector(DIETClassifier):
         SPARSE_INPUT_DROPOUT: False,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
-        # Small values may hurt performance.
+        # Small values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EPOCHS: 20,
         # How many examples to use for hold out validation set
-        # Large values may hurt performance.
+        # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
         # ## Selector config
         # Name of the intent for which this response selector is to be trained

From 179ad4ff48c70bcb7a5007116507586bbab4886d Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 17:08:18 +0100
Subject: [PATCH 508/633] revert back to old requirements

---
 requirements.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 452957aea85f..82d4386b34c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,3 +62,6 @@ python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
 tensorflow-addons==0.7.0
+tensor2tensor==1.14.0
+tensorflow_hub==0.7.0
+tensorflow-probability==0.7.0

From b6b1ff9371df1f0200d526bfe1b494af020f4718 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 17:32:10 +0100
Subject: [PATCH 509/633] fix persisting and loading of ted policy

---
 rasa/core/policies/ted_policy.py | 41 ++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 96f663986053..867cd3730a4d 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -61,6 +61,9 @@
 logger = logging.getLogger(__name__)
 
 
+SAVE_MODEL_FILE_NAME = "ted_policy"
+
+
 class TEDPolicy(Policy):
     """Transformer Embedding Dialogue (TED) Policy is described in
     https://arxiv.org/abs/1910.00486.
@@ -389,8 +392,7 @@ def persist(self, path: Text):
             )
             return
 
-        file_name = "ted_policy"
-        tf_model_file = os.path.join(path, f"{file_name}.tf_model")
+        tf_model_file = os.path.join(path, f"{SAVE_MODEL_FILE_NAME}.tf_model")
 
         rasa.utils.io.create_directory_for_file(tf_model_file)
 
@@ -398,16 +400,22 @@ def persist(self, path: Text):
 
         self.model.save(tf_model_file)
 
-        with open(os.path.join(path, file_name + ".priority.pkl"), "wb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".priority.pkl"), "wb"
+        ) as f:
             pickle.dump(self.priority, f)
 
-        with open(os.path.join(path, file_name + ".meta.pkl"), "wb") as f:
+        with open(os.path.join(path, SAVE_MODEL_FILE_NAME + ".meta.pkl"), "wb") as f:
             pickle.dump(self.config, f)
 
-        with open(os.path.join(path, file_name + ".data_example.pkl"), "wb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl"), "wb"
+        ) as f:
             pickle.dump(self.data_example, f)
 
-        with open(os.path.join(path, file_name + ".label_data.pkl"), "wb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".label_data.pkl"), "wb"
+        ) as f:
             pickle.dump(self._label_data, f)
 
     @classmethod
@@ -423,26 +431,33 @@ def load(cls, path: Text) -> "TEDPolicy":
                 f"'{os.path.abspath(path)}' doesn't exist."
             )
 
-        file_name = "TED_policy"
-        tf_model_file = os.path.join(path, f"{file_name}.tf_model")
+        tf_model_file = os.path.join(path, f"{SAVE_MODEL_FILE_NAME}.tf_model")
 
         featurizer = TrackerFeaturizer.load(path)
 
-        if not os.path.exists(os.path.join(path, file_name + ".data_example.pkl")):
+        if not os.path.exists(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl")
+        ):
             return cls(featurizer=featurizer)
 
-        with open(os.path.join(path, file_name + ".data_example.pkl"), "rb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl"), "rb"
+        ) as f:
             model_data_example = RasaModelData(
                 label_key="label_ids", data=pickle.load(f)
             )
 
-        with open(os.path.join(path, file_name + ".label_data.pkl"), "rb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".label_data.pkl"), "rb"
+        ) as f:
             label_data = pickle.load(f)
 
-        with open(os.path.join(path, file_name + ".meta.pkl"), "rb") as f:
+        with open(os.path.join(path, SAVE_MODEL_FILE_NAME + ".meta.pkl"), "rb") as f:
             meta = pickle.load(f)
 
-        with open(os.path.join(path, file_name + ".priority.pkl"), "rb") as f:
+        with open(
+            os.path.join(path, SAVE_MODEL_FILE_NAME + ".priority.pkl"), "rb"
+        ) as f:
             priority = pickle.load(f)
 
         meta = train_utils.update_similarity_type(meta)

From 6b4e7b6bcc3be33a018d9b085ad4f1b2b46df90f Mon Sep 17 00:00:00 2001
From: Daksh <d.varshneya@rasa.com>
Date: Thu, 20 Feb 2020 17:37:55 +0100
Subject: [PATCH 510/633] removed unnecessary deps again

---
 requirements.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 82d4386b34c2..452957aea85f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,6 +62,3 @@ python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
 tensorflow-addons==0.7.0
-tensor2tensor==1.14.0
-tensorflow_hub==0.7.0
-tensorflow-probability==0.7.0

From 7a74f9899627d438536b0b90ca957461008e0e39 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 17:46:43 +0100
Subject: [PATCH 511/633] remove flask from test

---
 tests/core/test_nlg.py | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/tests/core/test_nlg.py b/tests/core/test_nlg.py
index 4d5ac70975d5..fa890404844a 100644
--- a/tests/core/test_nlg.py
+++ b/tests/core/test_nlg.py
@@ -1,13 +1,10 @@
-import asyncio
 import uuid
 from typing import Text, Any
 
 import jsonschema
 import pytest
-from flask import Flask, request, jsonify
-from pytest_localserver.http import WSGIServer
+from sanic import Sanic, response
 
-import rasa.utils.io
 from rasa.core.nlg.callback import (
     nlg_request_format_spec,
     CallbackNaturalLanguageGenerator,
@@ -19,10 +16,11 @@
 
 
 def nlg_app(base_url="/"):
-    app = Flask(__name__)
+
+    app = Sanic(__name__)
 
     @app.route(base_url, methods=["POST"])
-    def generate():
+    async def generate(request):
         """Simple HTTP NLG generator, checks that the incoming request
         is format according to the spec."""
 
@@ -31,28 +29,26 @@ def generate():
         jsonschema.validate(nlg_call, nlg_request_format_spec())
 
         if nlg_call.get("template") == "utter_greet":
-            response = {"text": "Hey there!"}
+            response_dict = {"text": "Hey there!"}
         else:
-            response = {"text": "Sorry, didn't get that."}
-        return jsonify(response)
+            response_dict = {"text": "Sorry, didn't get that."}
+        return response.json(response_dict)
 
     return app
 
 
 # noinspection PyShadowingNames
-@pytest.fixture(scope="module")
-def http_nlg(request):
-    http_server = WSGIServer(application=nlg_app())
-    http_server.start()
-
-    request.addfinalizer(http_server.stop)
-    return http_server.url
+@pytest.fixture()
+async def http_nlg(test_server):
+    server = await test_server(nlg_app())
+    yield server
+    await server.close()
 
 
 async def test_nlg(http_nlg, trained_rasa_model):
     sender = str(uuid.uuid1())
 
-    nlg_endpoint = EndpointConfig.from_dict({"url": http_nlg})
+    nlg_endpoint = EndpointConfig.from_dict({"url": http_nlg.make_url("/")})
     agent = Agent.load(trained_rasa_model, None, generator=nlg_endpoint)
 
     response = await agent.handle_text("/greet", sender_id=sender)

From e9f5a2d62af775ba8608999b26969de9c4b4c4cf Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 19:56:07 +0100
Subject: [PATCH 512/633] don't import raise_warning directly

---
 rasa/core/policies/embedding_policy.py              |  5 ++---
 rasa/core/policies/keras_policy.py                  | 13 +++++++------
 rasa/nlu/classifiers/embedding_intent_classifier.py |  4 ++--
 rasa/nlu/classifiers/sklearn_intent_classifier.py   |  8 ++++----
 rasa/nlu/config.py                                  | 10 +++++-----
 rasa/nlu/extractors/crf_entity_extractor.py         |  8 ++++----
 .../dense_featurizer/convert_featurizer.py          |  4 ++--
 .../sparse_featurizer/count_vectors_featurizer.py   |  4 ++--
 .../sparse_featurizer/regex_featurizer.py           |  5 ++---
 rasa/nlu/selectors/response_selector.py             |  4 ++--
 10 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index b628053e9d08..c3fa92ac97dc 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -4,7 +4,6 @@
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.ted_policy import TEDPolicy
-from rasa.constants import DOCS_URL_POLICIES
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -36,8 +35,8 @@
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
 )
-from rasa.utils.common import raise_warning
 from rasa.utils.tensorflow.models import RasaModel
+import rasa.utils.common as common_utils
 
 logger = logging.getLogger(__name__)
 
@@ -150,7 +149,7 @@ def __init__(
 
         super().__init__(featurizer, priority, max_history, model, **kwargs)
 
-        raise_warning(
+        common_utils.raise_warning(
             f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. "
             f"Use 'TEDPolicy' instead.",
             category=FutureWarning,
diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 4e00b3cb2f5d..c3dd1ed5ba36 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -14,17 +14,18 @@
     MaxHistoryTrackerFeaturizer,
     BinarySingleStateFeaturizer,
 )
-from rasa.constants import DOCS_URL_POLICIES
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.policy import Policy
 from rasa.core.trackers import DialogueStateTracker
-from rasa.utils.common import obtain_verbosity, raise_warning
+import rasa.utils.common as common_utils
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
 
+
 # there are a number of issues with imports from tensorflow. hence the deactivation
 # pytype: disable=import-error
 # pytype: disable=module-attr
 
+
 try:
     import cPickle as pickle
 except ImportError:
@@ -71,7 +72,7 @@ def __init__(
 
         self.current_epoch = current_epoch
 
-        raise_warning(
+        common_utils.raise_warning(
             "'KerasPolicy' is deprecated and will be removed in version "
             "2.0. Use 'TEDPolicy' instead.",
             category=FutureWarning,
@@ -151,7 +152,7 @@ def model_architecture(
             loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"]
         )
 
-        if obtain_verbosity() > 0:
+        if common_utils.obtain_verbosity() > 0:
             model.summary()
 
         return model
@@ -194,7 +195,7 @@ def train(
             epochs=self.epochs,
             batch_size=self.batch_size,
             shuffle=False,
-            verbose=obtain_verbosity(),
+            verbose=common_utils.obtain_verbosity(),
             **self._train_params,
         )
         self.current_epoch = self.epochs
@@ -228,7 +229,7 @@ def continue_training(
                 training_data.y,
                 epochs=self.current_epoch + 1,
                 batch_size=len(training_data.y),
-                verbose=obtain_verbosity(),
+                verbose=common_utils.obtain_verbosity(),
                 initial_epoch=self.current_epoch,
             )
 
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index b8bc881b763a..f1c472c7fa07 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -35,7 +35,7 @@
     EMBEDDING_DIMENSION,
     BILOU_FLAG,
 )
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 from rasa.utils.tensorflow.models import RasaModel
 
 logger = logging.getLogger(__name__)
@@ -152,7 +152,7 @@ def __init__(
             batch_tuple_sizes,
         )
 
-        raise_warning(
+        common_utils.raise_warning(
             "'EmbeddingIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 8faf484cce41..a2b6bb161834 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -6,7 +6,7 @@
 
 import numpy as np
 
-from rasa.constants import DOCS_URL_COMPONENTS, DOCS_URL_TRAINING_DATA_NLU
+from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu import utils
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import Component
@@ -15,7 +15,7 @@
 from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 
 logger = logging.getLogger(__name__)
 
@@ -63,7 +63,7 @@ def __init__(
             self.le = LabelEncoder()
         self.clf = clf
 
-        raise_warning(
+        common_utils.raise_warning(
             "'SklearnIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
@@ -101,7 +101,7 @@ def train(
         labels = [e.get("intent") for e in training_data.intent_examples]
 
         if len(set(labels)) < 2:
-            raise_warning(
+            common_utils.raise_warning(
                 "Can not train an intent classifier as there are not "
                 "enough intents. Need at least 2 different intents. "
                 "Skipping training of intent classifier.",
diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py
index 7887a582888c..b59af0ee5990 100644
--- a/rasa/nlu/config.py
+++ b/rasa/nlu/config.py
@@ -7,7 +7,7 @@
 import rasa.utils.io
 from rasa.constants import DEFAULT_CONFIG_PATH, DOCS_URL_PIPELINE
 from rasa.nlu.utils import json_to_string
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 
 logger = logging.getLogger(__name__)
 
@@ -68,7 +68,7 @@ def component_config_from_pipeline(
         c = pipeline[index]
         return override_defaults(defaults, c)
     except IndexError:
-        raise_warning(
+        common_utils.raise_warning(
             f"Tried to get configuration value for component "
             f"number {index} which is not part of your pipeline. "
             f"Returning `defaults`.",
@@ -103,7 +103,7 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No
                 "tensorflow_embedding": "supervised_embeddings",
             }
             if template_name in new_names:
-                raise_warning(
+                common_utils.raise_warning(
                     f"You have specified the pipeline template "
                     f"'{template_name}' which has been renamed to "
                     f"'{new_names[template_name]}'. "
@@ -118,7 +118,7 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No
             pipeline = registry.pipeline_template(template_name)
 
             if pipeline:
-                raise_warning(
+                common_utils.raise_warning(
                     "You are using a pipeline template. All pipelines templates "
                     "are deprecated and will be removed in version 2.0. Please add "
                     "the components you want to use directly to your configuration "
@@ -190,7 +190,7 @@ def set_component_attr(self, index, **kwargs) -> None:
         try:
             self.pipeline[index].update(kwargs)
         except IndexError:
-            raise_warning(
+            common_utils.raise_warning(
                 f"Tried to set configuration value for component "
                 f"number {index} which is not part of the pipeline.",
                 docs=DOCS_URL_PIPELINE,
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index b98b5daeb18e..48d23467e3a4 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
+import rasa.utils.common as common_utils
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.model import Metadata
@@ -12,7 +13,6 @@
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, DENSE_FEATURE_NAMES, ENTITIES
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_COMPONENTS
-from rasa.utils.common import raise_warning
 
 logger = logging.getLogger(__name__)
 
@@ -102,7 +102,7 @@ def __init__(
 
         self._validate_configuration()
 
-        raise_warning(
+        common_utils.raise_warning(
             "'CRFEntityExtractor' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
@@ -448,7 +448,7 @@ def _from_json_to_crf(
                 collected.append(t)
             elif collected:
                 collected_text = " ".join([t.text for t in collected])
-                raise_warning(
+                common_utils.raise_warning(
                     f"Misaligned entity annotation for '{collected_text}' "
                     f"in sentence '{message.text}' with intent "
                     f"'{message.get('intent')}'. "
@@ -484,7 +484,7 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]:
 
         tokens = message.get(TOKENS_NAMES[TEXT], [])
         if len(tokens) != len(features):
-            raise_warning(
+            common_utils.raise_warning(
                 f"Number of features ({len(features)}) for attribute "
                 f"'{DENSE_FEATURE_NAMES[TEXT]}' "
                 f"does not match number of tokens ({len(tokens)}). Set "
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 356b6af8ca8e..a497c91a6a61 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -18,7 +18,7 @@
 import tensorflow as tf
 import os
 
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 
 logger = logging.getLogger(__name__)
 
@@ -193,7 +193,7 @@ def train(
     ) -> None:
 
         if config is not None and config.language != "en":
-            raise_warning(
+            common_utils.raise_warning(
                 f"Since ``ConveRT`` model is trained only on an english "
                 f"corpus of conversations, this featurizer should only be "
                 f"used if your training data is in english language. "
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index bd296fab7faf..8e4cc21baa02 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, Optional, Text
 
 from rasa.constants import DOCS_URL_COMPONENTS
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 
 from sklearn.feature_extraction.text import CountVectorizer
 from rasa.nlu import utils
@@ -292,7 +292,7 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]) -> None:
 
         if any(text for tokens in all_tokens for text in tokens):
             # if there is some text in tokens, warn if there is no oov token
-            raise_warning(
+            common_utils.raise_warning(
                 f"The out of vocabulary token '{self.OOV_token}' was configured, but "
                 f"could not be found in any one of the NLU message training examples. "
                 f"All unseen words will be ignored during prediction.",
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index b0ae26c8113c..2709008bad00 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -1,7 +1,6 @@
 import logging
 import os
 import re
-import typing
 from typing import Any, Dict, List, Optional, Text, Union
 
 import numpy as np
@@ -21,7 +20,7 @@
 )
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 from rasa.nlu.model import Metadata
 
 logger = logging.getLogger(__name__)
@@ -140,7 +139,7 @@ def _generate_lookup_regex(
         # if it's a list, it should be the elements directly
         if isinstance(lookup_elements, list):
             elements_to_regex = lookup_elements
-            raise_warning(
+            common_utils.raise_warning(
                 f"Directly including lookup tables as a list is deprecated since Rasa "
                 f"1.6.",
                 FutureWarning,
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index a3d400c314cb..683efec35676 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -47,7 +47,7 @@
 )
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
-from rasa.utils.common import raise_warning
+import rasa.utils.common as common_utils
 from rasa.constants import DOCS_URL_COMPONENTS
 
 logger = logging.getLogger(__name__)
@@ -177,7 +177,7 @@ def __init__(
             batch_tuple_sizes,
         )
 
-        raise_warning(
+        common_utils.raise_warning(
             f"'ResponseSelector' is deprecated and will be removed in version 2.0. "
             f"Use 'DIETSelector' instead.",
             category=FutureWarning,

From 758902746aa2f0c5f2bbd48ac6bf6a43c414f76f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 20:48:21 +0100
Subject: [PATCH 513/633] review comment

---
 rasa/nlu/selectors/response_selector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index f6be9650704a..bb27036ec4b4 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -191,7 +191,7 @@ def __init__(
         # the following properties cannot be adapted for the ResponseSelector
         component_config[INTENT_CLASSIFICATION] = True
         component_config[ENTITY_RECOGNITION] = False
-        component_config[BILOU_FLAG] = False
+        component_config[BILOU_FLAG] = None
 
         super().__init__(
             component_config,

From 72cc0217471f5a9957abc295599383c99630a2c1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 21:28:46 +0100
Subject: [PATCH 514/633] add missing masked_lm option to response selector

---
 rasa/nlu/selectors/response_selector.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index bb27036ec4b4..1b99d2429b4d 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -173,6 +173,9 @@ class ResponseSelector(DIETClassifier):
         # Large values may hurt performance, e.g. model accuracy.
         EVAL_NUM_EXAMPLES: 0,
         # ## Selector config
+        # If 'True' random tokens of the input message will be masked and the model
+        # should predict those tokens.
+        MASKED_LM: False,
         # Name of the intent for which this response selector is to be trained
         RETRIEVAL_INTENT: None,
     }

From 1c0955052e33f6ac4e9f32f54713aa6e289d7ba7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 20 Feb 2020 22:21:52 +0100
Subject: [PATCH 515/633] Use ResponseSelector instead of DIETSelector

---
 changelog/5266.removal.rst                     | 2 +-
 data/configs_for_docs/default_spacy_config.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/changelog/5266.removal.rst b/changelog/5266.removal.rst
index b95e2242d3af..0afb018ffc41 100644
--- a/changelog/5266.removal.rst
+++ b/changelog/5266.removal.rst
@@ -17,5 +17,5 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
+  - name: ResponseSelector
 ```
\ No newline at end of file
diff --git a/data/configs_for_docs/default_spacy_config.yml b/data/configs_for_docs/default_spacy_config.yml
index 0f3cdbf5e593..46b75c8078c7 100644
--- a/data/configs_for_docs/default_spacy_config.yml
+++ b/data/configs_for_docs/default_spacy_config.yml
@@ -11,4 +11,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: DIETSelector
\ No newline at end of file
+  - name: ResponseSelector
\ No newline at end of file

From 3200732143f2c96fad363b8de35428705cae22f5 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:11:51 +0100
Subject: [PATCH 516/633] clean up NLU tests

---
 tests/conftest.py                             |  14 +-
 tests/nlu/classifiers/test_diet_classifier.py |  32 +++
 tests/nlu/conftest.py                         |  18 +-
 tests/nlu/selectors/test_selectors.py         |   4 +-
 tests/nlu/test_components.py                  |   8 +-
 tests/nlu/test_config.py                      | 141 ++++++++-----
 tests/nlu/test_evaluation.py                  |   7 +-
 tests/nlu/test_interpreter.py                 |  16 +-
 tests/nlu/test_persistor.py                   |  12 +-
 tests/nlu/test_train.py                       | 196 +++++-------------
 tests/nlu/utilities.py                        |  10 +-
 tests/test_train.py                           |  27 ---
 tests/utilities.py                            |   3 +-
 13 files changed, 226 insertions(+), 262 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 4957360c217b..702f694cfcba 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,6 @@
 import asyncio
+import os
+
 from sanic.request import Request
 from sanic.testing import SanicTestClient
 
@@ -29,6 +31,7 @@
     END_TO_END_STORY_FILE,
     MOODBOT_MODEL_PATH,
 )
+from tests.utilities import update_number_of_epochs
 
 DEFAULT_CONFIG_PATH = "rasa/cli/default_config.yml"
 
@@ -74,10 +77,15 @@ async def default_agent(_trained_default_agent: Agent) -> Agent:
 
 
 @pytest.fixture(scope="session")
-async def trained_moodbot_path() -> Text:
+async def trained_moodbot_path(tmpdir_factory: TempdirFactory) -> Text:
+    output = tmpdir_factory.mktemp("moodbot").strpath
+    tmp_config_file = os.path.join(output, "config.yml")
+
+    update_number_of_epochs("examples/moodbot/config.yml", tmp_config_file)
+
     return await train_async(
         domain="examples/moodbot/domain.yml",
-        config="examples/moodbot/config.yml",
+        config=tmp_config_file,
         training_files="examples/moodbot/data/",
         output_path=MOODBOT_MODEL_PATH,
     )
@@ -182,7 +190,7 @@ async def trained_core_model(
 async def trained_nlu_model(
     trained_async,
     default_domain_path: Text,
-    default_config: List[Policy],
+    blank_config,
     default_nlu_data: Text,
     default_stories_file: Text,
 ) -> Text:
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index 074072e5ccbf..a32e1c257891 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -262,3 +262,35 @@ async def test_margin_loss_is_not_normalized(
 
     # make sure top ranking is reflected in intent prediction
     assert parse_data.get("intent") == intent_ranking[0]
+
+
+async def test_random_seed(component_builder, tmpdir, supervised_embeddings_config):
+    """test if train result is the same for two runs of tf embedding"""
+
+    # set fixed random seed
+    idx = supervised_embeddings_config.component_names.index(
+        "EmbeddingIntentClassifier"
+    )
+    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
+    idx = supervised_embeddings_config.component_names.index("CRFEntityExtractor")
+    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
+
+    # first run
+    (trained_a, _, persisted_path_a) = await train(
+        supervised_embeddings_config,
+        path=tmpdir.strpath + "_a",
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder,
+    )
+    # second run
+    (trained_b, _, persisted_path_b) = await train(
+        supervised_embeddings_config,
+        path=tmpdir.strpath + "_b",
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder,
+    )
+    loaded_a = Interpreter.load(persisted_path_a, component_builder)
+    loaded_b = Interpreter.load(persisted_path_b, component_builder)
+    result_a = loaded_a.parse("hello")["intent"]["confidence"]
+    result_b = loaded_b.parse("hello")["intent"]["confidence"]
+    assert result_a == result_b
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index b19320e18717..b3c564698ece 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -16,29 +16,25 @@ def component_builder():
 
 
 @pytest.fixture(scope="session")
-def spacy_nlp(component_builder, default_config):
+def spacy_nlp(component_builder, blank_config):
     spacy_nlp_config = {"name": "SpacyNLP"}
-    return component_builder.create_component(spacy_nlp_config, default_config).nlp
+    return component_builder.create_component(spacy_nlp_config, blank_config).nlp
 
 
 @pytest.fixture(scope="session")
-def spacy_nlp_component(component_builder, default_config):
+def spacy_nlp_component(component_builder, blank_config):
     spacy_nlp_config = {"name": "SpacyNLP"}
-    return component_builder.create_component(spacy_nlp_config, default_config)
+    return component_builder.create_component(spacy_nlp_config, blank_config)
 
 
 @pytest.fixture(scope="session")
-def mitie_feature_extractor(
-    component_builder: ComponentBuilder, default_config: RasaNLUModelConfig
-):
+def mitie_feature_extractor(component_builder: ComponentBuilder, blank_config):
     mitie_nlp_config = {"name": "MitieNLP"}
-    return component_builder.create_component(
-        mitie_nlp_config, default_config
-    ).extractor
+    return component_builder.create_component(mitie_nlp_config, blank_config).extractor
 
 
 @pytest.fixture(scope="session")
-def default_config() -> RasaNLUModelConfig:
+def blank_config() -> RasaNLUModelConfig:
     return RasaNLUModelConfig({"language": "en", "pipeline": []})
 
 
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 02fd54764ba3..69a634bba50f 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -12,12 +12,12 @@
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "ResponseSelector", EPOCHS: 2},
+            {"name": "ResponseSelector", EPOCHS: 1},
         ],
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETSelector", EPOCHS: 2},
+            {"name": "DIETSelector", EPOCHS: 1},
         ],
     ],
 )
diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py
index 44845d7460a3..5f86b79e9653 100644
--- a/tests/nlu/test_components.py
+++ b/tests/nlu/test_components.py
@@ -63,12 +63,12 @@ def test_find_unavailable_packages():
     assert unavailable == {"my_made_up_package_name", "foo_bar"}
 
 
-def test_builder_create_by_module_path(component_builder, default_config):
+def test_builder_create_by_module_path(component_builder, blank_config):
     from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 
     path = "rasa.nlu.featurizers.sparse_featurizer.regex_featurizer.RegexFeaturizer"
     component_config = {"name": path}
-    component = component_builder.create_component(component_config, default_config)
+    component = component_builder.create_component(component_config, blank_config)
     assert type(component) == RegexFeaturizer
 
 
@@ -85,12 +85,12 @@ def test_builder_create_by_module_path(component_builder, default_config):
     ],
 )
 def test_create_component_exception_messages(
-    component_builder, default_config, test_input, expected_output, error
+    component_builder, blank_config, test_input, expected_output, error
 ):
 
     with pytest.raises(error):
         component_config = {"name": test_input}
-        component_builder.create_component(component_config, default_config)
+        component_builder.create_component(component_config, blank_config)
 
 
 def test_builder_load_unknown(component_builder):
diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py
index 38b0c9a05859..3a30a18f6bb7 100644
--- a/tests/nlu/test_config.py
+++ b/tests/nlu/test_config.py
@@ -1,9 +1,12 @@
 import json
 import tempfile
+import os
 from typing import Text
 
 import pytest
 
+import rasa.utils.io as io_utils
+from nlu.config import RasaNLUModelConfig
 from rasa.nlu import config
 from rasa.nlu.components import ComponentBuilder
 from rasa.nlu.registry import registered_pipeline_templates
@@ -12,18 +15,21 @@
 from tests.nlu.utilities import write_file_config
 
 
-def test_blank_config(default_config):
+def test_blank_config(blank_config):
     file_config = {}
     f = write_file_config(file_config)
     final_config = config.load(f.name)
-    assert final_config.as_dict() == default_config.as_dict()
+
+    assert final_config.as_dict() == blank_config.as_dict()
 
 
 def test_invalid_config_json():
     file_config = """pipeline: [pretrained_embeddings_spacy"""  # invalid yaml
+
     with tempfile.NamedTemporaryFile("w+", suffix="_tmp_config_file.json") as f:
         f.write(file_config)
         f.flush()
+
         with pytest.raises(config.InvalidConfigError):
             config.load(f.name)
 
@@ -31,6 +37,7 @@ def test_invalid_config_json():
 def test_invalid_pipeline_template():
     args = {"pipeline": "my_made_up_name"}
     f = write_file_config(args)
+
     with pytest.raises(config.InvalidConfigError) as execinfo:
         config.load(f.name)
     assert "unknown pipeline template" in str(execinfo.value)
@@ -38,7 +45,7 @@ def test_invalid_pipeline_template():
 
 def test_invalid_many_tokenizers_in_config():
     nlu_config = {
-        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}],
+        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}]
     }
 
     with pytest.raises(config.InvalidConfigError) as execinfo:
@@ -46,37 +53,31 @@ def test_invalid_many_tokenizers_in_config():
     assert "More then one tokenizer is used" in str(execinfo.value)
 
 
-def test_invalid_requred_components_in_config():
-    spacy_config = {
-        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}],
-    }
-    convert_config = {
-        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}],
-    }
-    lm_config = {
-        "pipeline": [
-            {"name": "ConveRTTokenizer"},
-            {"name": "LanguageModelFeaturizer"},
-        ],
-    }
-    count_vectors_config = {
-        "pipeline": [{"name": "CountVectorsFeaturizer"}],
-    }
-
-    with pytest.raises(config.InvalidConfigError) as execinfo:
-        Trainer(config.RasaNLUModelConfig(spacy_config))
-    assert "Add required components to the pipeline" in str(execinfo.value)
-
+@pytest.mark.parametrize(
+    "_config",
+    [
+        {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}]},
+        {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}]},
+        {
+            "pipeline": [
+                {"name": "ConveRTTokenizer"},
+                {"name": "LanguageModelFeaturizer"},
+            ]
+        },
+    ],
+)
+def test_missing_required_component(_config):
     with pytest.raises(config.InvalidConfigError) as execinfo:
-        Trainer(config.RasaNLUModelConfig(convert_config))
+        Trainer(config.RasaNLUModelConfig(_config))
     assert "Add required components to the pipeline" in str(execinfo.value)
 
-    with pytest.raises(config.InvalidConfigError) as execinfo:
-        Trainer(config.RasaNLUModelConfig(lm_config))
-    assert "Add required components to the pipeline" in str(execinfo.value)
 
+@pytest.mark.parametrize(
+    "_config", [{"pipeline": [{"name": "CountVectorsFeaturizer"}]}]
+)
+def test_missing_property(_config):
     with pytest.raises(config.InvalidConfigError) as execinfo:
-        Trainer(config.RasaNLUModelConfig(count_vectors_config)).train(TrainingData())
+        Trainer(config.RasaNLUModelConfig(_config)).train(TrainingData())
     assert "Missing property" in str(execinfo.value)
 
 
@@ -86,6 +87,7 @@ def test_invalid_requred_components_in_config():
 def test_pipeline_registry_lookup(pipeline_template: Text):
     args = {"pipeline": pipeline_template}
     f = write_file_config(args)
+
     final_config = config.load(f.name)
     components = [c for c in final_config.pipeline]
 
@@ -99,43 +101,78 @@ def test_default_config_file():
     assert len(final_config) > 1
 
 
-def test_set_attr_on_component(pretrained_embeddings_spacy_config):
-    idx_classifier = pretrained_embeddings_spacy_config.component_names.index(
-        "SklearnIntentClassifier"
-    )
-    idx_tokenizer = pretrained_embeddings_spacy_config.component_names.index(
-        "SpacyTokenizer"
+def test_set_attr_on_component():
+    _config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "SpacyNLP"},
+                {"name": "SpacyTokenizer"},
+                {"name": "SpacyFeaturizer"},
+                {"name": "RegexFeaturizer"},
+                {"name": "CRFEntityExtractor"},
+                {"name": "EntitySynonymMapper"},
+                {"name": "SklearnIntentClassifier"},
+            ],
+        }
     )
-    pretrained_embeddings_spacy_config.set_component_attr(idx_classifier, C=324)
+    idx_classifier = _config.component_names.index("SklearnIntentClassifier")
+    idx_tokenizer = _config.component_names.index("SpacyTokenizer")
 
-    assert pretrained_embeddings_spacy_config.for_component(idx_tokenizer) == {
-        "name": "SpacyTokenizer"
-    }
-    assert pretrained_embeddings_spacy_config.for_component(idx_classifier) == {
+    _config.set_component_attr(idx_classifier, C=324)
+
+    assert _config.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"}
+    assert _config.for_component(idx_classifier) == {
         "name": "SklearnIntentClassifier",
         "C": 324,
     }
 
 
-def test_override_defaults_supervised_embeddings_pipeline(supervised_embeddings_config):
+def test_override_defaults_supervised_embeddings_pipeline():
     builder = ComponentBuilder()
 
-    idx_featurizer = supervised_embeddings_config.component_names.index(
-        "CountVectorsFeaturizer"
-    )
-    idx_classifier = supervised_embeddings_config.component_names.index(
-        "EmbeddingIntentClassifier"
+    _config = RasaNLUModelConfig(
+        {
+            "language": "en",
+            "pipeline": [
+                {"name": "SpacyNLP"},
+                {"name": "SpacyTokenizer"},
+                {"name": "SpacyFeaturizer", "pooling": "max"},
+                {"name": "SklearnIntentClassifier", "max_cross_validation_folds": 1},
+            ],
+        }
     )
 
-    config_featurizer = supervised_embeddings_config.for_component(idx_featurizer)
-    config_classifier = supervised_embeddings_config.for_component(idx_classifier)
+    idx_featurizer = _config.component_names.index("SpacyFeaturizer")
+    idx_classifier = _config.component_names.index("SklearnIntentClassifier")
 
     component1 = builder.create_component(
-        config_featurizer, supervised_embeddings_config
+        _config.for_component(idx_featurizer), _config
     )
-    assert component1.max_ngram == 1
+    assert component1.component_config["pooling"] == "max"
 
     component2 = builder.create_component(
-        config_classifier, supervised_embeddings_config
+        _config.for_component(idx_classifier), _config
     )
-    assert component2.component_config["epochs"] == 3
+    assert component2.component_config["max_cross_validation_folds"] == 1
+
+
+def config_files_in(config_directory: Text):
+    return [
+        os.path.join(config_directory, f)
+        for f in os.listdir(config_directory)
+        if os.path.isfile(os.path.join(config_directory, f))
+    ]
+
+
+@pytest.mark.parametrize(
+    "config_file",
+    config_files_in("data/configs_for_docs") + config_files_in("docker/configs"),
+)
+def test_train_docker_and_docs_configs(config_file: Text):
+    content = io_utils.read_yaml_file(config_file)
+
+    loaded_config = config.load(config_file)
+
+    assert len(loaded_config.component_names) > 1
+    assert loaded_config.language == content["language"]
diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index f8ac3176d115..40d43035a6b4 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -271,11 +271,12 @@ def test_drop_intents_below_freq():
 
 
 def test_run_evaluation(unpacked_trained_moodbot_path):
-    data = DEFAULT_DATA_PATH
-
     result = run_evaluation(
-        data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=False
+        DEFAULT_DATA_PATH,
+        os.path.join(unpacked_trained_moodbot_path, "nlu"),
+        errors=False,
     )
+
     assert result.get("intent_evaluation")
     assert result.get("entity_evaluation").get("CRFEntityExtractor")
 
diff --git a/tests/nlu/test_interpreter.py b/tests/nlu/test_interpreter.py
index f83807d4c0c7..0960524a9c4e 100644
--- a/tests/nlu/test_interpreter.py
+++ b/tests/nlu/test_interpreter.py
@@ -18,13 +18,18 @@
 @pytest.mark.parametrize(
     "pipeline_template", list(registry.registered_pipeline_templates.keys())
 )
-async def test_interpreter(pipeline_template, component_builder, tmpdir):
+async def test_interpreter_on_pipeline_templates(
+    pipeline_template, component_builder, tmpdir
+):
     test_data = "data/examples/rasa/demo-rasa.json"
-    _conf = utilities.base_test_conf(pipeline_template)
-    _conf["data"] = test_data
+
+    config = utilities.base_test_conf(pipeline_template)
+    config["data"] = test_data
+
     td = training_data.load_data(test_data)
+
     interpreter = await utilities.interpreter_for(
-        component_builder, "data/examples/rasa/demo-rasa.json", tmpdir.strpath, _conf
+        component_builder, "data/examples/rasa/demo-rasa.json", tmpdir.strpath, config
     )
 
     texts = ["good bye", "i am looking for an indian spot"]
@@ -60,9 +65,10 @@ async def test_interpreter(pipeline_template, component_builder, tmpdir):
         {"rasa_version": "0.14.4"},
         {"rasa_version": "0.15.0a1"},
         {"rasa_version": "1.0.0a1"},
+        {"rasa_version": "1.5.0"},
     ],
 )
-def test_model_not_compatible(metadata):
+def test_model_is_not_compatible(metadata):
     with pytest.raises(rasa.nlu.model.UnsupportedModelError):
         Interpreter.ensure_model_compatibility(metadata)
 
diff --git a/tests/nlu/test_persistor.py b/tests/nlu/test_persistor.py
index 8371060a37bd..47300b61e858 100644
--- a/tests/nlu/test_persistor.py
+++ b/tests/nlu/test_persistor.py
@@ -14,7 +14,7 @@ class Object:
 
 # noinspection PyPep8Naming
 @mock_s3
-async def test_list_method_method_in_AWSPersistor(component_builder, tmpdir):
+async def test_list_method_method_in_AWS_persistor(component_builder, tmpdir):
     # artificially create a persisted model
     _config = utilities.base_test_conf("keyword")
     os.environ["BUCKET_NAME"] = "rasa-test"
@@ -38,7 +38,7 @@ async def test_list_method_method_in_AWSPersistor(component_builder, tmpdir):
 
 # noinspection PyPep8Naming
 @mock_s3
-def test_list_models_method_raise_exeception_in_AWSPersistor():
+def test_list_models_method_raise_exeception_in_AWS_persistor():
     os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
 
     awspersistor = persistor.AWSPersistor("rasa-test")
@@ -75,7 +75,7 @@ def test_s3_private_retrieve_tar():
 
 
 # noinspection PyPep8Naming
-def test_list_models_method_in_GCSPersistor():
+def test_list_models_method_in_GCS_persistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
@@ -97,7 +97,7 @@ def mocked_list_blobs():
 
 
 # noinspection PyPep8Naming
-def test_list_models_method_raise_exeception_in_GCSPersistor():
+def test_list_models_method_raise_exeception_in_GCS_persistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
@@ -117,7 +117,7 @@ def mocked_list_blobs():
 
 
 # noinspection PyPep8Naming
-def test_list_models_method_in_AzurePersistor():
+def test_list_models_method_in_Azure_persistor():
     # noinspection PyUnusedLocal
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {
@@ -141,7 +141,7 @@ def mocked_list_blobs(container_name, prefix=None):
 
 
 # noinspection PyPep8Naming
-def test_list_models_method_raise_exeception_in_AzurePersistor():
+def test_list_models_method_raise_exeception_in_Azure_persistor():
     def mocked_init(self, *args, **kwargs):
         self._model_dir_and_model_from_filename = lambda x: {"blob_name": ("project",)}[
             x
diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 9037d0c7971d..ce77272f28cb 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -1,19 +1,16 @@
 import os
-
 import pytest
 
 from rasa.nlu import registry, train
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Interpreter, Trainer
-from rasa.nlu.train import create_persistor
 from rasa.nlu.training_data import TrainingData
 from rasa.utils.tensorflow.constants import EPOCHS
-from tests.nlu import utilities
 from tests.nlu.conftest import DEFAULT_DATA_PATH
 
 
 def as_pipeline(*components):
-    return [{"name": c, EPOCHS: 2} for c in components]
+    return [{"name": c, EPOCHS: 1} for c in components]
 
 
 def pipelines_for_tests():
@@ -51,17 +48,8 @@ def pipelines_for_tests():
                 "SpacyNLP",
                 "SpacyTokenizer",
                 "SpacyFeaturizer",
-                "RegexFeaturizer",
-                "LexicalSyntacticFeaturizer",
-                "CountVectorsFeaturizer",
-                "CRFEntityExtractor",
-                "DucklingHTTPExtractor",
                 "SpacyEntityExtractor",
                 "SklearnIntentClassifier",
-                "DIETClassifier",
-                "ResponseSelector",
-                "DIETSelector",
-                "EntitySynonymMapper",
             ),
         ),
         (
@@ -70,48 +58,18 @@ def pipelines_for_tests():
                 "HFTransformersNLP",
                 "LanguageModelTokenizer",
                 "LanguageModelFeaturizer",
-                "RegexFeaturizer",
-                "LexicalSyntacticFeaturizer",
-                "CountVectorsFeaturizer",
-                "CRFEntityExtractor",
-                "DucklingHTTPExtractor",
-                "DIETClassifier",
-                "ResponseSelector",
-                "DIETSelector",
-                "EntitySynonymMapper",
-            ),
-        ),
-        (
-            "en",
-            as_pipeline(
-                "ConveRTTokenizer",
-                "ConveRTFeaturizer",
-                "RegexFeaturizer",
-                "LexicalSyntacticFeaturizer",
-                "CountVectorsFeaturizer",
-                "CRFEntityExtractor",
-                "DucklingHTTPExtractor",
                 "DIETClassifier",
-                "ResponseSelector",
-                "DIETSelector",
-                "EntitySynonymMapper",
             ),
         ),
+        ("en", as_pipeline("ConveRTTokenizer", "ConveRTFeaturizer", "DIETClassifier")),
         (
             "en",
             as_pipeline(
                 "MitieNLP",
                 "MitieTokenizer",
                 "MitieFeaturizer",
-                "RegexFeaturizer",
-                "CountVectorsFeaturizer",
                 "MitieEntityExtractor",
-                "DucklingHTTPExtractor",
                 "MitieIntentClassifier",
-                "DIETClassifier",
-                "ResponseSelector",
-                "DIETSelector",
-                "EntitySynonymMapper",
             ),
         ),
         (
@@ -120,14 +78,8 @@ def pipelines_for_tests():
                 "MitieNLP",
                 "JiebaTokenizer",
                 "MitieFeaturizer",
-                "RegexFeaturizer",
-                "CountVectorsFeaturizer",
                 "MitieEntityExtractor",
                 "MitieIntentClassifier",
-                "DIETClassifier",
-                "ResponseSelector",
-                "DIETSelector",
-                "EntitySynonymMapper",
             ),
         ),
     ]
@@ -136,102 +88,65 @@ def pipelines_for_tests():
 def test_all_components_are_in_at_least_one_test_pipeline():
     """There is a template that includes all components to
     test the train-persist-load-use cycle. Ensures that
-    really all Components are in there."""
+    really all components are in there."""
 
     all_components = [c["name"] for _, p in pipelines_for_tests() for c in p]
+
     for cls in registry.component_classes:
         assert (
             cls.name in all_components
         ), "`all_components` template is missing component."
 
 
-@pytest.mark.parametrize(
-    "pipeline_template", list(registry.registered_pipeline_templates.keys())
-)
-async def test_train_model(pipeline_template, component_builder, tmpdir):
-    _config = utilities.base_test_conf(pipeline_template)
+@pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
+async def test_train_persist_load_parse(language, pipeline, component_builder, tmpdir):
+    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
+
     (trained, _, persisted_path) = await train(
         _config,
         path=tmpdir.strpath,
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
-    assert trained.pipeline
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
 
+    assert trained.pipeline
 
-async def test_random_seed(component_builder, tmpdir, supervised_embeddings_config):
-    """test if train result is the same for two runs of tf embedding"""
+    loaded = Interpreter.load(persisted_path, component_builder)
 
-    # set fixed random seed
-    idx = supervised_embeddings_config.component_names.index(
-        "EmbeddingIntentClassifier"
-    )
-    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
-    idx = supervised_embeddings_config.component_names.index("CRFEntityExtractor")
-    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
-
-    # first run
-    (trained_a, _, persisted_path_a) = await train(
-        supervised_embeddings_config,
-        path=tmpdir.strpath + "_a",
-        data=DEFAULT_DATA_PATH,
-        component_builder=component_builder,
-    )
-    # second run
-    (trained_b, _, persisted_path_b) = await train(
-        supervised_embeddings_config,
-        path=tmpdir.strpath + "_b",
-        data=DEFAULT_DATA_PATH,
-        component_builder=component_builder,
-    )
-    loaded_a = Interpreter.load(persisted_path_a, component_builder)
-    loaded_b = Interpreter.load(persisted_path_b, component_builder)
-    result_a = loaded_a.parse("hello")["intent"]["confidence"]
-    result_b = loaded_b.parse("hello")["intent"]["confidence"]
-    assert result_a == result_b
+    assert loaded.pipeline
+    assert loaded.parse("Rasa is great!") is not None
 
 
 @pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
-async def test_train_model_on_test_pipelines(
-    language, pipeline, component_builder, tmpdir
-):
+def test_train_model_without_data(language, pipeline, component_builder, tmpdir):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
-    (trained, _, persisted_path) = await train(
-        _config,
-        path=tmpdir.strpath,
-        data=DEFAULT_DATA_PATH,
-        component_builder=component_builder,
-    )
-    assert trained.pipeline
+
+    trainer = Trainer(_config, component_builder)
+    trainer.train(TrainingData())
+    persisted_path = trainer.persist(tmpdir.strpath)
+
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
+    assert loaded.parse("Rasa is great!") is not None
 
 
 @pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
-async def test_train_model_no_events(language, pipeline, component_builder, tmpdir):
+def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
-    (trained, _, persisted_path) = await train(
-        _config,
-        path=tmpdir.strpath,
-        data="./data/test/demo-rasa-noents.json",
-        component_builder=component_builder,
-    )
-    assert trained.pipeline
+
+    trainer = Trainer(_config, component_builder)
+    persisted_path = trainer.persist(tmpdir.strpath)
+
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
+    assert loaded.parse("Rasa is great!") is not None
 
 
 async def test_train_model_empty_pipeline(component_builder):
-    # Should return an empty pipeline
-    _config = utilities.base_test_conf(pipeline_template=None)
+    _config = RasaNLUModelConfig({"pipeline": None, "language": "en"})
+
     with pytest.raises(ValueError):
         await train(
             _config, data=DEFAULT_DATA_PATH, component_builder=component_builder
@@ -239,14 +154,17 @@ async def test_train_model_empty_pipeline(component_builder):
 
 
 async def test_train_named_model(component_builder, tmpdir):
-    _config = utilities.base_test_conf("keyword")
+    _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"})
+
     (trained, _, persisted_path) = await train(
         _config,
         path=tmpdir.strpath,
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
+
     assert trained.pipeline
+
     normalized_path = os.path.dirname(os.path.normpath(persisted_path))
     # should be saved in a dir named after a project
     assert normalized_path == tmpdir.strpath
@@ -256,6 +174,7 @@ async def test_handles_pipeline_with_non_existing_component(
     component_builder, pretrained_embeddings_spacy_config
 ):
     pretrained_embeddings_spacy_config.pipeline.append({"name": "my_made_up_component"})
+
     with pytest.raises(Exception) as execinfo:
         await train(
             pretrained_embeddings_spacy_config,
@@ -265,56 +184,39 @@ async def test_handles_pipeline_with_non_existing_component(
     assert "Cannot find class" in str(execinfo.value)
 
 
-@pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
-def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
-    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
-    trainer = Trainer(_config, component_builder)
-    persistor = create_persistor(_config)
-    persisted_path = trainer.persist(tmpdir.strpath, persistor)
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
-
-
-@pytest.mark.parametrize("language, pipeline", pipelines_for_tests())
-def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
-    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
-    trainer = Trainer(_config, component_builder)
-    trainer.train(TrainingData())
-    persistor = create_persistor(_config)
-    persisted_path = trainer.persist(tmpdir.strpath, persistor)
-    loaded = Interpreter.load(persisted_path, component_builder)
-    assert loaded.pipeline
-    assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None
-
+async def test_train_model_training_data_persisted(component_builder, tmpdir):
+    _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"})
 
-async def test_train_model_no_training_data_persisted(component_builder, tmpdir):
-    _config = utilities.base_test_conf("keyword")
     (trained, _, persisted_path) = await train(
         _config,
         path=tmpdir.strpath,
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
-        persist_nlu_training_data=False,
+        persist_nlu_training_data=True,
     )
+
     assert trained.pipeline
+
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
-    assert loaded.model_metadata.get("training_data") is None
+    assert loaded.model_metadata.get("training_data") is not None
 
 
-async def test_train_model_training_data_persisted(component_builder, tmpdir):
-    _config = utilities.base_test_conf("keyword")
+async def test_train_model_no_training_data_persisted(component_builder, tmpdir):
+    _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"})
+
     (trained, _, persisted_path) = await train(
         _config,
         path=tmpdir.strpath,
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
-        persist_nlu_training_data=True,
+        persist_nlu_training_data=False,
     )
+
     assert trained.pipeline
+
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
-    assert loaded.model_metadata.get("training_data") is not None
+    assert loaded.model_metadata.get("training_data") is None
diff --git a/tests/nlu/utilities.py b/tests/nlu/utilities.py
index 2f43cb55fcef..9c168370c385 100644
--- a/tests/nlu/utilities.py
+++ b/tests/nlu/utilities.py
@@ -1,10 +1,18 @@
 import tempfile
-
 import ruamel.yaml as yaml
 
+from typing import Text
+
+import rasa.utils.io as io_utils
+
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Interpreter
 from rasa.nlu.train import train
+from rasa.nlu.classifiers.diet_classifier import DIETClassifier
+from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
+from rasa.nlu.selectors.diet_selector import DIETSelector
+from rasa.nlu.selectors.response_selector import ResponseSelector
+from rasa.utils.tensorflow.constants import EPOCHS
 
 
 def base_test_conf(pipeline_template):
diff --git a/tests/test_train.py b/tests/test_train.py
index d0bb3194ff39..7a71ae0370e5 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -9,7 +9,6 @@
 
 from rasa.train import train_core, train_nlu, train
 from tests.core.test_model import _fingerprint
-from tests.utilities import update_number_of_epochs
 
 
 @pytest.mark.parametrize(
@@ -130,29 +129,3 @@ def test_train_nlu_temp_files(
     )
 
     assert count_temp_rasa_files(tempfile.tempdir) == 0
-
-
-def config_files_in(config_directory: Text):
-    return [
-        os.path.join(config_directory, f)
-        for f in os.listdir(config_directory)
-        if os.path.isfile(os.path.join(config_directory, f))
-    ]
-
-
-@pytest.mark.parametrize(
-    "config_file",
-    config_files_in("data/configs_for_docs") + config_files_in("docker/configs"),
-)
-def test_train_docker_and_docs_configs(
-    config_file: Text, tmp_path: Text, default_nlu_data: Text
-):
-    output = str(tmp_path)
-    tmp_config_file = os.path.join(output, "config.yml")
-
-    update_number_of_epochs(config_file, tmp_config_file)
-
-    train_nlu(tmp_config_file, default_nlu_data, output=output)
-
-    files = os.listdir(output)
-    assert any([f.startswith("nlu") and f.endswith("tar.gz") for f in files])
diff --git a/tests/utilities.py b/tests/utilities.py
index 118af373919d..6aa53fed8859 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -1,7 +1,8 @@
-from typing import Text
 from yarl import URL
+from typing import Text
 
 import rasa.utils.io as io_utils
+
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
 from rasa.nlu.selectors.diet_selector import DIETSelector

From 6a205ac9ee8a6c44b44bc253a151d976406bc1ab Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:22:16 +0100
Subject: [PATCH 517/633] update diet classifier test

---
 tests/nlu/classifiers/test_diet_classifier.py | 45 ++++++++++---------
 tests/nlu/selectors/test_selectors.py         |  4 +-
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index a32e1c257891..f5f98b7ccca3 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -91,16 +91,18 @@ def test_check_labels_features_exist(messages, expected):
             },
             {"name": "CountVectorsFeaturizer"},
             {"name": "ConveRTFeaturizer"},
-            {"name": "DIETClassifier", EPOCHS: 3},
+            {"name": "DIETClassifier", EPOCHS: 1},
         ],
         [
             {"name": "WhitespaceTokenizer"},
             {"name": "CountVectorsFeaturizer"},
-            {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 3},
+            {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 1},
         ],
     ],
 )
-async def test_train_persist_load(pipeline, component_builder, tmpdir):
+async def test_train_persist_load_with_different_settings(
+    pipeline, component_builder, tmpdir
+):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})
 
     (trainer, trained, persisted_path) = await train(
@@ -116,10 +118,7 @@ async def test_train_persist_load(pipeline, component_builder, tmpdir):
     loaded = Interpreter.load(persisted_path, component_builder)
 
     assert loaded.pipeline
-    assert loaded.parse("hello") == trained.parse("hello")
-    assert loaded.parse("Hello today is Monday, again!") == trained.parse(
-        "Hello today is Monday, again!"
-    )
+    assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!")
 
 
 async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
@@ -129,7 +128,7 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
         {
             "pipeline": [
                 {"name": "WhitespaceTokenizer"},
-                {"name": "DIETClassifier", EPOCHS: 3},
+                {"name": "DIETClassifier", EPOCHS: 1},
             ],
             "language": "en",
         }
@@ -157,31 +156,31 @@ def as_pipeline(*components):
     "classifier_params, data_path, output_length, output_should_sum_to_1",
     [
         (
-            {RANDOM_SEED: 42, EPOCHS: 3},
+            {RANDOM_SEED: 42, EPOCHS: 1},
             "data/test/many_intents.md",
             10,
             True,
         ),  # default config
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 3},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 1},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # no normalization
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 3, EPOCHS: 3},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 3, EPOCHS: 1},
             "data/test/many_intents.md",
             3,
             True,
         ),  # lower than default ranking_length
         (
-            {RANDOM_SEED: 42, RANKING_LENGTH: 12, EPOCHS: 3},
+            {RANDOM_SEED: 42, RANKING_LENGTH: 12, EPOCHS: 1},
             "data/test/many_intents.md",
             LABEL_RANKING_LENGTH,
             False,
         ),  # higher than default ranking_length
         (
-            {RANDOM_SEED: 42, EPOCHS: 3},
+            {RANDOM_SEED: 42, EPOCHS: 1},
             "examples/moodbot/data/nlu.md",
             7,
             True,
@@ -228,7 +227,7 @@ async def test_softmax_normalization(
 
 @pytest.mark.parametrize(
     "classifier_params, output_length",
-    [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 3}, LABEL_RANKING_LENGTH)],
+    [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1}, LABEL_RANKING_LENGTH)],
 )
 async def test_margin_loss_is_not_normalized(
     monkeypatch, component_builder, tmpdir, classifier_params, output_length
@@ -264,16 +263,20 @@ async def test_margin_loss_is_not_normalized(
     assert parse_data.get("intent") == intent_ranking[0]
 
 
-async def test_random_seed(component_builder, tmpdir, supervised_embeddings_config):
+async def test_set_random_seed(component_builder, tmpdir, supervised_embeddings_config):
     """test if train result is the same for two runs of tf embedding"""
 
     # set fixed random seed
-    idx = supervised_embeddings_config.component_names.index(
-        "EmbeddingIntentClassifier"
+    _config = RasaNLUModelConfig(
+        {
+            "pipeline": [
+                {"name": "WhitespaceTokenizer"},
+                {"name": "CountVectorsFeaturizer"},
+                {"name": "DIETClassifier", RANDOM_SEED: 1, EPOCHS: 1},
+            ],
+            "language": "en",
+        }
     )
-    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
-    idx = supervised_embeddings_config.component_names.index("CRFEntityExtractor")
-    supervised_embeddings_config.set_component_attr(idx, random_seed=1)
 
     # first run
     (trained_a, _, persisted_path_a) = await train(
@@ -289,8 +292,10 @@ async def test_random_seed(component_builder, tmpdir, supervised_embeddings_conf
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
+
     loaded_a = Interpreter.load(persisted_path_a, component_builder)
     loaded_b = Interpreter.load(persisted_path_b, component_builder)
     result_a = loaded_a.parse("hello")["intent"]["confidence"]
     result_b = loaded_b.parse("hello")["intent"]["confidence"]
+
     assert result_a == result_b
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 8eae54ff4654..fbce352a6b8b 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -17,6 +17,7 @@
     ],
 )
 def test_train_selector(pipeline, component_builder, tmpdir):
+    # use data that include some responses
     td = load_data("data/examples/rasa/demo-rasa.md")
     td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
     td = td.merge(td_responses)
@@ -30,7 +31,8 @@ def test_train_selector(pipeline, component_builder, tmpdir):
     persisted_path = trainer.persist(tmpdir)
 
     assert trainer.pipeline
+
     loaded = Interpreter.load(persisted_path, component_builder)
+
     assert loaded.pipeline
     assert loaded.parse("hello") is not None
-    assert loaded.parse("Hello today is Monday, again!") is not None

From 5c8ed35c6562a6fd892debd9991881cf60da037f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:30:27 +0100
Subject: [PATCH 518/633] clean up

---
 tests/conftest.py                             | 2 +-
 tests/nlu/classifiers/test_diet_classifier.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 702f694cfcba..b7d854c28dba 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -190,7 +190,7 @@ async def trained_core_model(
 async def trained_nlu_model(
     trained_async,
     default_domain_path: Text,
-    blank_config,
+    default_config: List[Policy],
     default_nlu_data: Text,
     default_stories_file: Text,
 ) -> Text:
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index f5f98b7ccca3..87b660ea1355 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -12,6 +12,7 @@
     RANDOM_SEED,
     RANKING_LENGTH,
     EPOCHS,
+    MASKED_LM,
 )
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.model import Interpreter
@@ -91,7 +92,7 @@ def test_check_labels_features_exist(messages, expected):
             },
             {"name": "CountVectorsFeaturizer"},
             {"name": "ConveRTFeaturizer"},
-            {"name": "DIETClassifier", EPOCHS: 1},
+            {"name": "DIETClassifier", MASKED_LM: True, EPOCHS: 1},
         ],
         [
             {"name": "WhitespaceTokenizer"},

From 33015fd704d5ba9c456c604ae086e4102c50f0d3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:37:57 +0100
Subject: [PATCH 519/633] update example configs

---
 examples/concertbot/config.yml       | 11 ++++++++++-
 examples/formbot/config.yml          |  5 ++---
 examples/knowledgebasebot/config.yml | 12 +++++++++++-
 examples/moodbot/config.yml          |  8 +++++++-
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/examples/concertbot/config.yml b/examples/concertbot/config.yml
index 39cbec66c118..2f46ed015b68 100644
--- a/examples/concertbot/config.yml
+++ b/examples/concertbot/config.yml
@@ -1,6 +1,15 @@
 language: en
 
-pipeline: supervised_embeddings
+pipeline:
+  - name: "WhitespaceTokenizer"
+  - name: "RegexFeaturizer"
+  - name: "CountVectorsFeaturizer"
+  - name: "CountVectorsFeaturizer"
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: "DIETClassifier"
+  - name: "EntitySynonymMapper"
 
 policies:
   - name: EmbeddingPolicy
diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml
index f9eb4ff0ff7b..7b6e73188076 100644
--- a/examples/formbot/config.yml
+++ b/examples/formbot/config.yml
@@ -2,15 +2,14 @@ language: en
 
 pipeline:
   - name: WhitespaceTokenizer
-  - name: CRFEntityExtractor
-  - name: EntitySynonymMapper
   - name: CountVectorsFeaturizer
     token_pattern: (?u)\b\w+\b
-  - name: DIETClassifier
   - name: DucklingHTTPExtractor
     url: http://localhost:8000
     dimensions:
       - number
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
 
 policies:
   - name: FallbackPolicy
diff --git a/examples/knowledgebasebot/config.yml b/examples/knowledgebasebot/config.yml
index 00e51f7ac3a3..06a094b32b73 100644
--- a/examples/knowledgebasebot/config.yml
+++ b/examples/knowledgebasebot/config.yml
@@ -1,5 +1,15 @@
 language: en
-pipeline: supervised_embeddings
+
+pipeline:
+  - name: "WhitespaceTokenizer"
+  - name: "RegexFeaturizer"
+  - name: "CountVectorsFeaturizer"
+  - name: "CountVectorsFeaturizer"
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: "DIETClassifier"
+  - name: "EntitySynonymMapper"
 
 policies:
   - name: MemoizationPolicy
diff --git a/examples/moodbot/config.yml b/examples/moodbot/config.yml
index 2378258b9730..d78149cea8f4 100644
--- a/examples/moodbot/config.yml
+++ b/examples/moodbot/config.yml
@@ -1,6 +1,12 @@
 language: en
 
-pipeline: "pretrained_embeddings_spacy"
+pipeline:
+  - name: "SpacyNLP"
+  - name: "SpacyTokenizer"
+  - name: "SpacyFeaturizer"
+  - name: "RegexFeaturizer"
+  - name: "DIETClassifier"
+  - name: "EntitySynonymMapper"
 
 policies:
   - name: EmbeddingPolicy

From 63f5f69ad68493455aeedc69cd9156844eb28c4c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:41:43 +0100
Subject: [PATCH 520/633] reduce number of train epochs

---
 tests/utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utilities.py b/tests/utilities.py
index ea6ccf9004d9..913be3ea672d 100644
--- a/tests/utilities.py
+++ b/tests/utilities.py
@@ -33,6 +33,6 @@ def update_number_of_epochs(config_path: Text, output_file: Text):
             DIETClassifier.name,
             ResponseSelector.name,
         ]:
-            component[EPOCHS] = 2
+            component[EPOCHS] = 1
 
     io_utils.write_yaml_file(config, output_file)

From eb81127c14a54cba21814541e36ca330e84a2845 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:53:56 +0100
Subject: [PATCH 521/633] fix random seed test

---
 tests/cli/conftest.py                         | 7 ++-----
 tests/nlu/classifiers/test_diet_classifier.py | 6 +++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py
index 64b6f8ba1154..c65108b642ac 100644
--- a/tests/cli/conftest.py
+++ b/tests/cli/conftest.py
@@ -36,6 +36,7 @@ def init_default_project(tmpdir_factory: TempdirFactory) -> str:
     os.environ["LOG_LEVEL"] = "ERROR"
 
     check_call(["rasa", "init", "--no-prompt"], cwd=path)
+
     return path
 
 
@@ -90,11 +91,7 @@ def _set_up_initial_project(testdir: Testdir):
     write_yaml_file(
         {
             "language": "en",
-            "pipeline": [
-                {"name": "WhitespaceTokenizer"},
-                {"name": "CountVectorsFeaturizer"},
-                {"name": "KeywordIntentClassifier"},
-            ],
+            "pipeline": [{"name": "KeywordIntentClassifier"}],
             "policies": [
                 {"name": "MappingPolicy"},
                 {"name": "MemoizationPolicy", "max_history": 5},
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index 87b660ea1355..f65dd2844f81 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -264,7 +264,7 @@ async def test_margin_loss_is_not_normalized(
     assert parse_data.get("intent") == intent_ranking[0]
 
 
-async def test_set_random_seed(component_builder, tmpdir, supervised_embeddings_config):
+async def test_set_random_seed(component_builder, tmpdir):
     """test if train result is the same for two runs of tf embedding"""
 
     # set fixed random seed
@@ -281,14 +281,14 @@ async def test_set_random_seed(component_builder, tmpdir, supervised_embeddings_
 
     # first run
     (trained_a, _, persisted_path_a) = await train(
-        supervised_embeddings_config,
+        _config,
         path=tmpdir.strpath + "_a",
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,
     )
     # second run
     (trained_b, _, persisted_path_b) = await train(
-        supervised_embeddings_config,
+        _config,
         path=tmpdir.strpath + "_b",
         data=DEFAULT_DATA_PATH,
         component_builder=component_builder,

From f1cc9a7f1de5a6b8bed722b843892a17f9c058ae Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 13:59:00 +0100
Subject: [PATCH 522/633] raise exception instead of NotImplemented

---
 rasa/core/policies/ted_policy.py        |  1 +
 rasa/nlu/classifiers/diet_classifier.py |  1 +
 rasa/utils/tensorflow/models.py         | 35 ++++++++++++++++---------
 setup.cfg                               |  2 +-
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index bcfee2ce458d..ae386e47d465 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -457,6 +457,7 @@ def load(cls, path: Text) -> "TEDPolicy":
         return cls(featurizer=featurizer, priority=priority, model=model, **meta)
 
 
+# accessing _tf_layers with any key results in key-error, disable it
 # pytype: disable=key-error
 
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 8b6b9f2c97f9..5bf81276a9ab 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -925,6 +925,7 @@ def _load_model(
         return model
 
 
+# accessing _tf_layers with any key results in key-error, disable it
 # pytype: disable=key-error
 
 
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 7318a6779e7c..41f80571dbc9 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -232,7 +232,7 @@ def train_dataset_function(_batch_size: int) -> tf.data.Dataset:
         )
 
     def _get_tf_evaluation_functions(
-        self, eager: bool, evaluation_model_data: Optional[RasaModelData],
+        self, eager: bool, evaluation_model_data: Optional[RasaModelData]
     ) -> Tuple[Optional[Callable], Optional[Callable]]:
         """Create evaluation tensorflow functions"""
 
@@ -248,10 +248,7 @@ def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset:
         return (
             evaluation_dataset_function,
             self._get_tf_call_model_function(
-                evaluation_dataset_function,
-                self._total_batch_loss,
-                eager,
-                "evaluation",
+                evaluation_dataset_function, self._total_batch_loss, eager, "evaluation"
             ),
         )
 
@@ -335,22 +332,36 @@ def linearly_increasing_batch_size(
             return int(batch_size[0])
 
     def compile(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def evaluate(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def test_on_batch(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def predict_on_batch(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def fit_generator(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def evaluate_generator(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
 
     def predict_generator(self, *args, **kwargs) -> None:
-        raise NotImplemented
+        raise Exception(
+            "This method should neither be called nor implemented in our code."
+        )
diff --git a/setup.cfg b/setup.cfg
index 9bd1c94f20ec..6f91f10e9442 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -24,4 +24,4 @@ license_file = LICENSE.txt
 
 [flake8]
 max-line-length = 88
-ignore = W503, E121, E126, E211, E225, E501, E203, E402, F401, F811, E231, F901
+ignore = W503, E121, E126, E211, E225, E501, E203, E402, F401, F811, E231

From 989f5fddbebd2a0e0f920088d1474ce973eabc1c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 14:06:53 +0100
Subject: [PATCH 523/633] added mitie docker image again

---
 changelog/5266.improvement.rst                | 21 +++++
 changelog/5266.removal.rst                    | 21 -----
 .../Dockerfile_pretrained_embeddings_mitie_en | 89 +++++++++++++++++++
 .../config_pretrained_embeddings_mitie.yml    | 11 +++
 4 files changed, 121 insertions(+), 21 deletions(-)
 delete mode 100644 changelog/5266.removal.rst
 create mode 100644 docker/Dockerfile_pretrained_embeddings_mitie_en
 create mode 100644 docker/configs/config_pretrained_embeddings_mitie.yml

diff --git a/changelog/5266.improvement.rst b/changelog/5266.improvement.rst
index bff559283a77..492b1afd5bcf 100644
--- a/changelog/5266.improvement.rst
+++ b/changelog/5266.improvement.rst
@@ -1 +1,22 @@
 We updated our code to TensorFlow 2.
+
+We added a new docker image for ConveRT.
+The new images uses the following configuration
+
+```
+language: "en"
+
+pipeline:
+  - name: ConveRTTokenizer
+  - name: ConveRTFeaturizer
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: "char_wb"
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+  - name: EntitySynonymMapper
+  - name: ResponseSelector
+```
\ No newline at end of file
diff --git a/changelog/5266.removal.rst b/changelog/5266.removal.rst
deleted file mode 100644
index 0afb018ffc41..000000000000
--- a/changelog/5266.removal.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-We replaced the MITIE Docker image with a Docker image that uses ConveRT.
-
-The new images uses the following configuration
-
-```
-language: "en"
-
-pipeline:
-  - name: ConveRTTokenizer
-  - name: ConveRTFeaturizer
-  - name: RegexFeaturizer
-  - name: LexicalSyntacticFeaturizer
-  - name: CountVectorsFeaturizer
-  - name: CountVectorsFeaturizer
-    analyzer: "char_wb"
-    min_ngram: 1
-    max_ngram: 4
-  - name: DIETClassifier
-  - name: EntitySynonymMapper
-  - name: ResponseSelector
-```
\ No newline at end of file
diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en
new file mode 100644
index 000000000000..f0c60555bde2
--- /dev/null
+++ b/docker/Dockerfile_pretrained_embeddings_mitie_en
@@ -0,0 +1,89 @@
+# Create common base stage
+FROM python:3.6-slim as base
+
+WORKDIR /build
+
+# Create virtualenv to isolate builds
+RUN python -m venv /build
+
+# Install common libraries
+RUN apt-get update -qq \
+ && apt-get install -y --no-install-recommends \
+    # required by psycopg2 at build and runtime
+    libpq-dev \
+     # required for health check
+    curl \
+ && apt-get autoremove -y
+
+# Make sure we use the virtualenv
+ENV PATH="/build/bin:$PATH"
+
+# Stage to build and install everything
+FROM base as builder
+
+WORKDIR /src
+
+# Install all required build libraries
+RUN apt-get update -qq \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    wget \
+    openssh-client \
+    graphviz-dev \
+    pkg-config \
+    git-core \
+    openssl \
+    libssl-dev \
+    libffi6 \
+    libffi-dev \
+    libpng-dev
+
+# Make sure we have the latest pip version
+RUN pip install -U pip
+
+# Download mitie model
+RUN wget -P /app/data/ https://s3-eu-west-1.amazonaws.com/mitie/total_word_feature_extractor.dat
+
+# Copy only what we really need
+COPY README.md .
+COPY setup.py .
+COPY setup.cfg .
+COPY MANIFEST.in .
+COPY alt_requirements/ ./alt_requirements
+COPY requirements.txt .
+COPY LICENSE.txt .
+
+# Install dependencies
+RUN pip install --no-cache-dir -r alt_requirements/requirements_pretrained_embeddings_mitie.txt
+
+# Install Rasa as package
+COPY rasa ./rasa
+RUN pip install .[sql,mitie]
+
+# Runtime stage which uses the virtualenv which we built in the previous stage
+FROM base AS runner
+
+WORKDIR /app
+
+# Copy over default pipeline config
+COPY sample_configs/config_pretrained_embeddings_mitie.yml config.yml
+
+# Copy over mitie model
+COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
+
+# Copy virtualenv from previous stage
+COPY --from=builder /build /build
+
+# Create a volume for temporary data
+VOLUME /tmp
+
+# Make sure the default group has the same permissions as the owner
+RUN chgrp -R 0 . && chmod -R g=u .
+
+# Don't run as root
+USER 1001
+
+EXPOSE 5005
+
+ENTRYPOINT ["rasa"]
+CMD ["--help"]
\ No newline at end of file
diff --git a/docker/configs/config_pretrained_embeddings_mitie.yml b/docker/configs/config_pretrained_embeddings_mitie.yml
new file mode 100644
index 000000000000..84b283d6e75d
--- /dev/null
+++ b/docker/configs/config_pretrained_embeddings_mitie.yml
@@ -0,0 +1,11 @@
+language: "en"
+
+pipeline:
+  - name: MitieNLP
+    model: "data/total_word_feature_extractor.dat"
+  - name: MitieTokenizer
+  - name: MitieEntityExtractor
+  - name: EntitySynonymMapper
+  - name: RegexFeaturizer
+  - name: MitieFeaturizer
+  - name: SklearnIntentClassifier
\ No newline at end of file

From 95f5fb5401b01d31d3b04d9149cfe94543293c29 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 14:08:55 +0100
Subject: [PATCH 524/633] clean up imports

---
 tests/nlu/utilities.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/nlu/utilities.py b/tests/nlu/utilities.py
index 9c168370c385..f06315263d9b 100644
--- a/tests/nlu/utilities.py
+++ b/tests/nlu/utilities.py
@@ -1,18 +1,9 @@
 import tempfile
 import ruamel.yaml as yaml
 
-from typing import Text
-
-import rasa.utils.io as io_utils
-
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Interpreter
 from rasa.nlu.train import train
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier
-from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier
-from rasa.nlu.selectors.diet_selector import DIETSelector
-from rasa.nlu.selectors.response_selector import ResponseSelector
-from rasa.utils.tensorflow.constants import EPOCHS
 
 
 def base_test_conf(pipeline_template):

From 446ff97a5866e1bf5e440c7fdb993b6572655109 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 14:10:53 +0100
Subject: [PATCH 525/633] update config path in docker file

---
 docker/Dockerfile_pretrained_embeddings_mitie_en | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en
index f0c60555bde2..ea9f4682f93b 100644
--- a/docker/Dockerfile_pretrained_embeddings_mitie_en
+++ b/docker/Dockerfile_pretrained_embeddings_mitie_en
@@ -66,7 +66,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY sample_configs/config_pretrained_embeddings_mitie.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_mitie.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat

From 39aeed08d954b70d7b1a0c5cc7064588148cea34 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 21 Feb 2020 14:30:51 +0100
Subject: [PATCH 526/633] make comment start from capital S

---
 rasa/nlu/selectors/response_selector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 1b99d2429b4d..49c4710a6967 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -154,7 +154,7 @@ class ResponseSelector(DIETClassifier):
         # ## Regularization parameters
         # The scale of regularization
         REGULARIZATION_CONSTANT: 0.002,
-        # sparsity of the weights in dense layers
+        # Sparsity of the weights in dense layers
         WEIGHT_SPARSITY: 0.8,
         # The scale of how important is to minimize the maximum similarity
         # between embeddings of different labels.

From dd6f1c8306e3b810244e4d626dbc7d65249b4efb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 14:30:27 +0100
Subject: [PATCH 527/633] refactor updating EVAL_NUM_EPOCHS

---
 rasa/core/policies/keras_policy.py            | 13 +++---
 rasa/core/policies/ted_policy.py              | 15 ++-----
 rasa/nlu/classifiers/diet_classifier.py       | 18 +++-----
 .../dense_featurizer/convert_featurizer.py    |  1 +
 rasa/utils/tensorflow/constants.py            |  2 +
 rasa/utils/train_utils.py                     | 41 ++++++++++++++++++-
 6 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index ce09a6bf2a59..23717965241e 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -164,24 +164,22 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        # set numpy random seed
+        # set random seed
         np.random.seed(self.random_seed)
+        tf.random.set_seed(self.random_seed)
 
         training_data = self.featurize_for_training(training_trackers, domain, **kwargs)
         # noinspection PyPep8Naming
         shuffled_X, shuffled_y = training_data.shuffled_X_y()
 
-        tf.random.set_seed(self.random_seed)
-
         if self.model is None:
             self.model = self.model_architecture(
                 shuffled_X.shape[1:], shuffled_y.shape[1:]
             )
 
         logger.debug(
-            "Fitting model with {} total samples and a "
-            "validation split of {}"
-            "".format(training_data.num_examples(), self.validation_split)
+            f"Fitting model with {training_data.num_examples()} total samples and a "
+            f"validation split of {self.validation_split}."
         )
 
         # filter out kwargs that cannot be passed to fit
@@ -199,7 +197,8 @@ def train(
             **self._train_params,
         )
         self.current_epoch = self.epochs
-        logger.info("Done fitting keras policy model")
+
+        logger.debug("Done fitting Keras Policy model.")
 
     def predict_action_probabilities(
         self, tracker: DialogueStateTracker, domain: Domain
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index ae386e47d465..a62d127457d1 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -55,6 +55,7 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
+    EVALUATE_ONCE_PER_EPOCH,
 )
 
 
@@ -191,8 +192,8 @@ def __init__(
 
         self.model = model
 
-        self._label_data = None
-        self.data_example = None
+        self._label_data = None  # RasaModelData
+        self.data_example = None  # Dict[Text, List[np.ndarray]]
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
@@ -201,15 +202,7 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = train_utils.check_deprecated_options(self.config)
 
         self.config = train_utils.update_similarity_type(self.config)
-
-        if self.config[EVAL_NUM_EPOCHS] == -1:
-            # magic value -1 is used to set evaluation to number of epochs
-            self.config[EVAL_NUM_EPOCHS] = self.config[EPOCHS]
-        elif self.config[EVAL_NUM_EPOCHS] < 1:
-            raise ValueError(
-                f"'{EVAL_NUM_EXAMPLES}' is set to '{self.config[EVAL_NUM_EPOCHS]}'. "
-                f"Only values > 1 are allowed for this configuration value."
-            )
+        self.config = train_utils.update_evaluation_parameters(self.config)
 
     # data helpers
     # noinspection PyPep8Naming
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 5bf81276a9ab..c8f38f5bb1cb 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -72,6 +72,7 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
+    EVALUATE_ONCE_PER_EPOCH,
 )
 
 
@@ -228,16 +229,9 @@ def _check_config_parameters(self) -> None:
         self.component_config = train_utils.update_similarity_type(
             self.component_config
         )
-
-        if self.component_config[EVAL_NUM_EPOCHS] == -1:
-            # magic value -1 is used to set evaluation to number of epochs
-            self.component_config[EVAL_NUM_EPOCHS] = self.component_config[EPOCHS]
-        elif self.component_config[EVAL_NUM_EPOCHS] < 1:
-            raise ValueError(
-                f"'{EVAL_NUM_EXAMPLES}' is set to "
-                f"'{self.component_config[EVAL_NUM_EPOCHS]}'. "
-                f"Only values > 1 are allowed for this configuration value."
-            )
+        self.component_config = train_utils.update_evaluation_parameters(
+            self.component_config
+        )
 
     # package safety checks
     @classmethod
@@ -265,7 +259,7 @@ def __init__(
         self.model = model
 
         # encode all label_ids with numbers
-        self._label_data = None
+        self._label_data = None  # RasaModelData
 
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes
@@ -273,7 +267,7 @@ def __init__(
         # number of entity tags
         self.num_tags = 0
 
-        self.data_example = None
+        self.data_example = None  # Dict[Text, List[np.ndarray]]
 
     @property
     def label_key(self) -> Optional[Text]:
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index a497c91a6a61..bf99dc932ad2 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -53,6 +53,7 @@ def _load_model(self) -> None:
         import tensorflow_text
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
+
         # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 5f59ff23e479..a3425cbc8664 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -52,3 +52,5 @@
 BILOU_FLAG = "BILOU_flag"
 
 RETRIEVAL_INTENT = "retrieval_intent"
+
+EVALUATE_ONCE_PER_EPOCH = -1
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index d799817d45de..1dddf3543b34 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -1,5 +1,4 @@
 import numpy as np
-import tensorflow as tf
 import logging
 import scipy.sparse
 from typing import Optional, Text, Dict, Any, Union, List
@@ -28,6 +27,8 @@
     DROP_RATE_LABEL,
     NEGATIVE_MARGIN_SCALE,
     DROP_RATE,
+    EPOCHS,
+    EVALUATE_ONCE_PER_EPOCH,
 )
 
 
@@ -51,6 +52,14 @@ def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarr
 
 
 def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
+    """
+    If SIMILARITY_TYPE is set to 'auto', update the SIMILARITY_TYPE depending
+    on the LOSS_TYPE.
+    Args:
+        config: model configuration
+
+    Returns: updated model configuration
+    """
     if config.get(SIMILARITY_TYPE) == "auto":
         if config[LOSS_TYPE] == "softmax":
             config[SIMILARITY_TYPE] = "inner"
@@ -112,6 +121,28 @@ def sequence_to_sentence_features(
     return np.expand_dims(features[-1], axis=0)
 
 
+def update_evaluation_parameters(config: Dict[Text, Any]) -> Dict[Text, Any]:
+    """
+    If EVAL_NUM_EPOCHS is set to -1, evaluate at the end of every epoch.
+
+    Args:
+        config: model configuration
+
+    Returns: updated model configuration
+    """
+
+    if config[EVAL_NUM_EPOCHS] == EVALUATE_ONCE_PER_EPOCH:
+        config[EVAL_NUM_EPOCHS] = config[EPOCHS]
+    elif config[EVAL_NUM_EPOCHS] < 1:
+        raise ValueError(
+            f"'{EVAL_NUM_EXAMPLES}' is set to "
+            f"'{config[EVAL_NUM_EPOCHS]}'. "
+            f"Only values > 1 are allowed for this configuration value."
+        )
+
+    return config
+
+
 def _replace_deprecated_option(
     old_option: Text, new_option: Union[Text, List[Text]], config: Dict[Text, Any]
 ) -> Dict[Text, Any]:
@@ -136,6 +167,14 @@ def _replace_deprecated_option(
 
 
 def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]:
+    """
+    If old model configuration parameters are present in the provided config, replace
+    them with the new parameters and log a warning.
+    Args:
+        config: model configuration
+
+    Returns: updated model configuration
+    """
 
     config = _replace_deprecated_option(
         "hidden_layers_sizes_pre_dial", [HIDDEN_LAYERS_SIZES, DIALOGUE], config

From 90c120327a3d4107d3e502aa77a115997d8bd000 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 14:54:26 +0100
Subject: [PATCH 528/633] fix tests

---
 tests/nlu/test_config.py     | 2 +-
 tests/nlu/test_evaluation.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py
index 3a30a18f6bb7..36cba509683a 100644
--- a/tests/nlu/test_config.py
+++ b/tests/nlu/test_config.py
@@ -6,7 +6,7 @@
 import pytest
 
 import rasa.utils.io as io_utils
-from nlu.config import RasaNLUModelConfig
+from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu import config
 from rasa.nlu.components import ComponentBuilder
 from rasa.nlu.registry import registered_pipeline_templates
diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index 97f1e6c68fc0..bf188af6d04f 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -278,7 +278,7 @@ def test_run_evaluation(unpacked_trained_moodbot_path):
     )
 
     assert result.get("intent_evaluation")
-    assert result.get("entity_evaluation").get("CRFEntityExtractor")
+    assert result.get("entity_evaluation").get("DIETClassifier")
 
 
 def test_run_cv_evaluation(pretrained_embeddings_spacy_config):

From 4d6eb7e3b06697557abe2f2427bd38f9dcf18ddf Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 15:48:38 +0100
Subject: [PATCH 529/633] move pickle dump and load to io utils

---
 .../Dockerfile_pretrained_embeddings_mitie_en |  2 +-
 .../config_pretrained_embeddings_mitie.yml    |  2 +-
 rasa/core/policies/ted_policy.py              | 83 +++++++++----------
 rasa/nlu/classifiers/diet_classifier.py       | 72 +++++++---------
 rasa/utils/io.py                              | 13 +++
 5 files changed, 84 insertions(+), 88 deletions(-)

diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en
index ea9f4682f93b..663986b7faf1 100644
--- a/docker/Dockerfile_pretrained_embeddings_mitie_en
+++ b/docker/Dockerfile_pretrained_embeddings_mitie_en
@@ -86,4 +86,4 @@ USER 1001
 EXPOSE 5005
 
 ENTRYPOINT ["rasa"]
-CMD ["--help"]
\ No newline at end of file
+CMD ["--help"]
diff --git a/docker/configs/config_pretrained_embeddings_mitie.yml b/docker/configs/config_pretrained_embeddings_mitie.yml
index 84b283d6e75d..1ff89972039a 100644
--- a/docker/configs/config_pretrained_embeddings_mitie.yml
+++ b/docker/configs/config_pretrained_embeddings_mitie.yml
@@ -8,4 +8,4 @@ pipeline:
   - name: EntitySynonymMapper
   - name: RegexFeaturizer
   - name: MitieFeaturizer
-  - name: SklearnIntentClassifier
\ No newline at end of file
+  - name: SklearnIntentClassifier
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index a62d127457d1..5766c71b6ee3 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import pickle
+from pathlib import Path
 
 import numpy as np
 import tensorflow as tf
@@ -9,7 +10,7 @@
 
 from typing import Any, List, Optional, Text, Dict, Tuple, Union
 
-import rasa.utils.io
+import rasa.utils.io as io_utils
 from rasa.core.domain import Domain
 from rasa.core.featurizers import (
     TrackerFeaturizer,
@@ -306,7 +307,10 @@ def train(
             return
 
         # keep one example for persisting and loading
-        self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
+        self.data_example = {
+            feature_name: [feature[:1] for feature in features]
+            for feature_name, features in model_data.items()
+        }
 
         self.model = TED(
             model_data.get_signature(),
@@ -359,31 +363,27 @@ def persist(self, path: Text):
             )
             return
 
-        tf_model_file = os.path.join(path, f"{SAVE_MODEL_FILE_NAME}.tf_model")
+        model_path = Path(path)
+        tf_model_file = model_path / f"{SAVE_MODEL_FILE_NAME}.tf_model"
 
-        rasa.utils.io.create_directory_for_file(tf_model_file)
+        io_utils.create_directory_for_file(tf_model_file)
 
         self.featurizer.persist(path)
 
-        self.model.save(tf_model_file)
-
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".priority.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.priority, f)
-
-        with open(os.path.join(path, SAVE_MODEL_FILE_NAME + ".meta.pkl"), "wb") as f:
-            pickle.dump(self.config, f)
-
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.data_example, f)
+        self.model.save(str(tf_model_file))
 
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".label_data.pkl"), "wb"
-        ) as f:
-            pickle.dump(self._label_data, f)
+        io_utils.pickle_dump(
+            model_path / SAVE_MODEL_FILE_NAME + ".priority.json", self.priority
+        )
+        io_utils.pickle_dump(
+            model_path / SAVE_MODEL_FILE_NAME + ".meta.json", self.config
+        )
+        io_utils.pickle_dump(
+            model_path / SAVE_MODEL_FILE_NAME + ".data_example.json", self.data_example
+        )
+        io_utils.pickle_dump(
+            model_path / SAVE_MODEL_FILE_NAME + ".label_data.json", self._label_data
+        )
 
     @classmethod
     def load(cls, path: Text) -> "TEDPolicy":
@@ -398,39 +398,30 @@ def load(cls, path: Text) -> "TEDPolicy":
                 f"'{os.path.abspath(path)}' doesn't exist."
             )
 
-        tf_model_file = os.path.join(path, f"{SAVE_MODEL_FILE_NAME}.tf_model")
+        model_path = Path(path)
+        tf_model_file = model_path / f"{SAVE_MODEL_FILE_NAME}.tf_model"
 
         featurizer = TrackerFeaturizer.load(path)
 
-        if not os.path.exists(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl")
-        ):
+        if not (model_path / SAVE_MODEL_FILE_NAME + ".data_example.pkl").is_file():
             return cls(featurizer=featurizer)
 
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".data_example.pkl"), "rb"
-        ) as f:
-            model_data_example = RasaModelData(
-                label_key="label_ids", data=pickle.load(f)
-            )
-
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".label_data.pkl"), "rb"
-        ) as f:
-            label_data = pickle.load(f)
-
-        with open(os.path.join(path, SAVE_MODEL_FILE_NAME + ".meta.pkl"), "rb") as f:
-            meta = pickle.load(f)
-
-        with open(
-            os.path.join(path, SAVE_MODEL_FILE_NAME + ".priority.pkl"), "rb"
-        ) as f:
-            priority = pickle.load(f)
+        loaded_data = io_utils.pickle_load(
+            model_path / SAVE_MODEL_FILE_NAME + ".data_example.pkl"
+        )
+        label_data = io_utils.pickle_load(
+            model_path / SAVE_MODEL_FILE_NAME + ".label_data.pkl"
+        )
+        meta = io_utils.pickle_load(model_path / SAVE_MODEL_FILE_NAME + ".meta.pkl")
+        priority = io_utils.pickle_load(
+            model_path / SAVE_MODEL_FILE_NAME + ".priority.pkl"
+        )
 
+        model_data_example = RasaModelData(label_key="label_ids", data=loaded_data)
         meta = train_utils.update_similarity_type(meta)
 
         model = TED.load(
-            tf_model_file,
+            str(tf_model_file),
             model_data_example,
             data_signature=model_data_example.get_signature(),
             config=meta,
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c8f38f5bb1cb..db89d845b0ed 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 
 import numpy as np
 import os
@@ -10,7 +11,7 @@
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union
 
-import rasa.utils.io
+import rasa.utils.io as io_utils
 import rasa.nlu.utils.bilou_utils as bilou_utils
 from rasa.nlu.extractors import EntityExtractor
 from rasa.nlu.test import determine_token_labels
@@ -246,7 +247,7 @@ def __init__(
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
-        """Declare instance variables with default values"""
+        """Declare instance variables with default values."""
 
         super().__init__(component_config)
 
@@ -786,30 +787,28 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         if self.model is None:
             return {"file": None}
 
-        tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
-
-        rasa.utils.io.create_directory_for_file(tf_model_file)
-
-        self.model.save(tf_model_file)
-
-        with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "wb") as f:
-            pickle.dump(self.data_example, f)
+        model_dir = Path(model_dir)
+        tf_model_file = model_dir / f"{file_name}.tf_model"
 
-        with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "wb") as f:
-            pickle.dump(self._label_data, f)
+        io_utils.create_directory_for_file(tf_model_file)
 
-        with open(
-            os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.inverted_label_dict, f)
+        self.model.save(str(tf_model_file))
 
-        with open(os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "wb") as f:
-            pickle.dump(self.inverted_tag_dict, f)
-
-        with open(
-            os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "wb"
-        ) as f:
-            pickle.dump(self.batch_tuple_sizes, f)
+        io_utils.pickle_dump(
+            model_dir / f"{file_name}.data_example.pkl", self.data_example
+        )
+        io_utils.pickle_dump(
+            model_dir / f"{file_name}.label_data.pkl", self._label_data
+        )
+        io_utils.pickle_dump(
+            model_dir / f"{file_name}.inv_label_dict.pkl", self.inverted_label_dict
+        )
+        io_utils.pickle_dump(
+            model_dir / f"{file_name}.inv_tag_dict.pkl", self.inverted_tag_dict
+        )
+        io_utils.pickle_dump(
+            model_dir / f"{file_name}.batch_tuple_sizes.pkl", self.batch_tuple_sizes
+        )
 
         return {"file": file_name}
 
@@ -856,24 +855,17 @@ def load(
     def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         file_name = meta.get("file")
 
-        with open(os.path.join(model_dir, file_name + ".data_example.pkl"), "rb") as f:
-            data_example = pickle.load(f)
-
-        with open(os.path.join(model_dir, file_name + ".label_data.pkl"), "rb") as f:
-            label_data = pickle.load(f)
-
-        with open(
-            os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb"
-        ) as f:
-            inv_label_dict = pickle.load(f)
+        model_dir = Path(model_dir)
 
-        with open(os.path.join(model_dir, file_name + ".inv_tag_dict.pkl"), "rb") as f:
-            inv_tag_dict = pickle.load(f)
-
-        with open(
-            os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "rb"
-        ) as f:
-            batch_tuple_sizes = pickle.load(f)
+        data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl")
+        label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl")
+        inv_label_dict = io_utils.pickle_load(
+            model_dir / f"{file_name}.inv_label_dict.pkl"
+        )
+        inv_tag_dict = io_utils.pickle_load(model_dir / f"{file_name}.inv_tag_dict.pkl")
+        batch_tuple_sizes = io_utils.pickle_load(
+            model_dir / f"{file_name}.batch_tuple_sizes.pkl"
+        )
 
         return (
             batch_tuple_sizes,
diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index 406a1e47a60c..16ce344380ac 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import os
+import pickle
 import tarfile
 import tempfile
 import typing
@@ -157,6 +158,18 @@ def dump_obj_as_json_to_file(filename: Text, obj: Any) -> None:
     write_text_file(json.dumps(obj, indent=2), filename)
 
 
+def pickle_dump(filename: Text, obj: Any):
+    """Saves object to file."""
+    with open(filename, "wb") as f:
+        pickle.dump(obj, f)
+
+
+def pickle_load(filename: Text) -> Any:
+    """Loads an object from a file."""
+    with open(filename, "rb") as f:
+        return pickle.load(f)
+
+
 def read_config_file(filename: Text) -> Dict[Text, Any]:
     """Parses a yaml configuration file. Content needs to be a dictionary
 

From 64ff5ca5955312aeaa689131b57bb0fa2b979462 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 16:08:35 +0100
Subject: [PATCH 530/633] review comments

---
 rasa/core/policies/fallback.py       |  4 ++--
 rasa/core/policies/form_policy.py    |  2 +-
 rasa/core/policies/mapping_policy.py |  2 +-
 rasa/core/policies/policy.py         |  4 ++++
 rasa/core/policies/ted_policy.py     | 23 +++++++++--------------
 5 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py
index 1f314ed89611..d700be0f052f 100644
--- a/rasa/core/policies/fallback.py
+++ b/rasa/core/policies/fallback.py
@@ -128,7 +128,7 @@ def fallback_scores(
     ) -> List[float]:
         """Prediction scores used if a fallback is necessary."""
 
-        result = [0.0] * domain.num_actions
+        result = self._default_predictions(domain)
         idx = domain.index_for_action(self.fallback_action_name)
         result[idx] = fallback_score
         return result
@@ -145,7 +145,7 @@ def predict_action_probabilities(
         nlu_data = tracker.latest_message.parse_data
 
         if tracker.latest_action_name == self.fallback_action_name:
-            result = [0.0] * domain.num_actions
+            result = self._default_predictions(domain)
             idx = domain.index_for_action(ACTION_LISTEN_NAME)
             result[idx] = 1.0
 
diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py
index f6af451c8d16..3deab0bc4ced 100644
--- a/rasa/core/policies/form_policy.py
+++ b/rasa/core/policies/form_policy.py
@@ -140,7 +140,7 @@ def predict_action_probabilities(
         self, tracker: DialogueStateTracker, domain: Domain
     ) -> List[float]:
         """Predicts the corresponding form action if there is an active form"""
-        result = [0.0] * domain.num_actions
+        result = self._default_predictions(domain)
 
         if tracker.active_form.get("name"):
             logger.debug(
diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py
index ebf92aa14401..413aed58ac9f 100644
--- a/rasa/core/policies/mapping_policy.py
+++ b/rasa/core/policies/mapping_policy.py
@@ -91,7 +91,7 @@ def predict_action_probabilities(
         predicted with the highest probability of all policies. If it is not
         the policy will predict zero for every action."""
 
-        prediction = [0.0] * domain.num_actions
+        prediction = self._default_predictions(domain)
         intent = tracker.latest_message.intent.get("name")
         if intent == USER_INTENT_RESTART:
             action = ACTION_RESTART_NAME
diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py
index 16ffd1bc4d9a..9a95ba044a40 100644
--- a/rasa/core/policies/policy.py
+++ b/rasa/core/policies/policy.py
@@ -107,6 +107,10 @@ def load(cls, path: Text) -> "Policy":
             Needs to load its featurizer"""
         raise NotImplementedError("Policy must have the capacity to load itself.")
 
+    @staticmethod
+    def _default_predictions(domain: Domain) -> List[float]:
+        return [0.0] * domain.num_actions
+
 
 def confidence_scores_for(action_name, value, domain) -> List[float]:
     """Returns confidence scores if a single action is predicted.
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 5766c71b6ee3..6c9d16bb0b1d 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -1,7 +1,6 @@
 import copy
 import logging
 import os
-import pickle
 from pathlib import Path
 
 import numpy as np
@@ -216,9 +215,8 @@ def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray:
     # noinspection PyPep8Naming
     def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
         """Prepare Y data for training: features for label_ids."""
-
-        # full dialogue featurizer is used
-        if len(label_ids.shape) == 2:
+        is_full_dialogue_featurizer_used = len(label_ids.shape) == 2
+        if is_full_dialogue_featurizer_used:
             return np.stack(
                 [
                     np.stack(
@@ -281,20 +279,17 @@ def train(
     ) -> None:
         """Train the policy on given training trackers."""
 
-        # set numpy random seed
-        np.random.seed(self.config[RANDOM_SEED])
-
         # dealing with training data
         training_data = self.featurize_for_training(training_trackers, domain, **kwargs)
 
         self._label_data = self._create_label_data(domain)
 
         # check if number of negatives is less than number of label_ids
-        logger.debug(
-            f"Check if num_neg {self.config[NUM_NEG]} is smaller "
-            f"than number of label_ids {domain.num_actions}, "
-            f"else set num_neg to the number of label_ids - 1."
-        )
+        if self.config[NUM_NEG] < domain.num_actions:
+            logger.debug(
+                f"Set '{NUM_NEG}' to the number of actions - 1, e.g. "
+                f"{domain.num_actions - 1}."
+            )
         self.config[NUM_NEG] = min(self.config[NUM_NEG], domain.num_actions - 1)
 
         # extract actual training data to feed to model
@@ -336,7 +331,7 @@ def predict_action_probabilities(
         Return the list of probabilities for the next actions.
         """
         if self.model is None:
-            return [0.0] * domain.num_actions
+            return self._default_predictions(domain)
 
         # create model data from tracker
         data_X = self.featurizer.create_X([tracker], domain)
@@ -352,7 +347,7 @@ def predict_action_probabilities(
 
         return confidence.tolist()
 
-    def persist(self, path: Text):
+    def persist(self, path: Text) -> None:
         """Persists the policy to a storage."""
 
         if self.model is None:

From 16466efdd5ea052046d1b6ba236ec52b7d19a755 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 16:21:21 +0100
Subject: [PATCH 531/633] review comments

---
 examples/concertbot/config.yml       |  3 ++-
 examples/formbot/config.yml          |  2 ++
 examples/knowledgebasebot/config.yml |  4 +++-
 examples/moodbot/config.yml          |  4 +++-
 examples/restaurantbot/config.yml    |  5 +++--
 tests/nlu/test_config.py             | 19 ++++++++-----------
 6 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/examples/concertbot/config.yml b/examples/concertbot/config.yml
index 2f46ed015b68..14b58dfd276f 100644
--- a/examples/concertbot/config.yml
+++ b/examples/concertbot/config.yml
@@ -3,6 +3,7 @@ language: en
 pipeline:
   - name: "WhitespaceTokenizer"
   - name: "RegexFeaturizer"
+  - name: "LexicalSyntacticFeaturizer"
   - name: "CountVectorsFeaturizer"
   - name: "CountVectorsFeaturizer"
     analyzer: "char_wb"
@@ -12,7 +13,7 @@ pipeline:
   - name: "EntitySynonymMapper"
 
 policies:
-  - name: EmbeddingPolicy
+  - name: TEDPolicy
     max_history: 5
     epochs: 200
     batch_size: 50
diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml
index 7b6e73188076..e4ef40b93e77 100644
--- a/examples/formbot/config.yml
+++ b/examples/formbot/config.yml
@@ -2,6 +2,7 @@ language: en
 
 pipeline:
   - name: WhitespaceTokenizer
+  - name: LexicalSyntacticFeaturizer
   - name: CountVectorsFeaturizer
     token_pattern: (?u)\b\w+\b
   - name: DucklingHTTPExtractor
@@ -9,6 +10,7 @@ pipeline:
     dimensions:
       - number
   - name: DIETClassifier
+    epochs: 100
   - name: EntitySynonymMapper
 
 policies:
diff --git a/examples/knowledgebasebot/config.yml b/examples/knowledgebasebot/config.yml
index 06a094b32b73..092617156b27 100644
--- a/examples/knowledgebasebot/config.yml
+++ b/examples/knowledgebasebot/config.yml
@@ -3,16 +3,18 @@ language: en
 pipeline:
   - name: "WhitespaceTokenizer"
   - name: "RegexFeaturizer"
+  - name: "LexicalSyntacticFeaturizer"
   - name: "CountVectorsFeaturizer"
   - name: "CountVectorsFeaturizer"
     analyzer: "char_wb"
     min_ngram: 1
     max_ngram: 4
   - name: "DIETClassifier"
+    epochs: 100
   - name: "EntitySynonymMapper"
 
 policies:
   - name: MemoizationPolicy
-  - name: EmbeddingPolicy
+  - name: TEDPolicy
     max_history: 5
     epochs: 100
diff --git a/examples/moodbot/config.yml b/examples/moodbot/config.yml
index d78149cea8f4..2e1c27bd1c62 100644
--- a/examples/moodbot/config.yml
+++ b/examples/moodbot/config.yml
@@ -5,11 +5,13 @@ pipeline:
   - name: "SpacyTokenizer"
   - name: "SpacyFeaturizer"
   - name: "RegexFeaturizer"
+  - name: "LexicalSyntacticFeaturizer"
   - name: "DIETClassifier"
+    epochs: 100
   - name: "EntitySynonymMapper"
 
 policies:
-  - name: EmbeddingPolicy
+  - name: TEDPolicy
     max_history: 5
     epochs: 100
   - name: MemoizationPolicy
diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml
index 9bd0371277b3..291570d0b96f 100644
--- a/examples/restaurantbot/config.yml
+++ b/examples/restaurantbot/config.yml
@@ -4,8 +4,9 @@ pipeline:
   - name: "SpacyNLP"
   - name: "SpacyTokenizer"
   - name: "SpacyFeaturizer"
-  - name: "SklearnIntentClassifier"
-  - name: "CRFEntityExtractor"
+  - name: "LexicalSyntacticFeaturizer"
+  - name: "DIETClassifier"
+    epochs: 100
     features: [
       ["low", "title", "upper"],
       [
diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py
index 36cba509683a..2714bedcbcbc 100644
--- a/tests/nlu/test_config.py
+++ b/tests/nlu/test_config.py
@@ -109,22 +109,19 @@ def test_set_attr_on_component():
                 {"name": "SpacyNLP"},
                 {"name": "SpacyTokenizer"},
                 {"name": "SpacyFeaturizer"},
-                {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor"},
-                {"name": "EntitySynonymMapper"},
-                {"name": "SklearnIntentClassifier"},
+                {"name": "DIETClassifier"},
             ],
         }
     )
-    idx_classifier = _config.component_names.index("SklearnIntentClassifier")
+    idx_classifier = _config.component_names.index("DIETClassifier")
     idx_tokenizer = _config.component_names.index("SpacyTokenizer")
 
-    _config.set_component_attr(idx_classifier, C=324)
+    _config.set_component_attr(idx_classifier, epochs=10)
 
     assert _config.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"}
     assert _config.for_component(idx_classifier) == {
-        "name": "SklearnIntentClassifier",
-        "C": 324,
+        "name": "DIETClassifier",
+        "epochs": 10,
     }
 
 
@@ -138,13 +135,13 @@ def test_override_defaults_supervised_embeddings_pipeline():
                 {"name": "SpacyNLP"},
                 {"name": "SpacyTokenizer"},
                 {"name": "SpacyFeaturizer", "pooling": "max"},
-                {"name": "SklearnIntentClassifier", "max_cross_validation_folds": 1},
+                {"name": "DIETClassifier", "epochs": 10},
             ],
         }
     )
 
     idx_featurizer = _config.component_names.index("SpacyFeaturizer")
-    idx_classifier = _config.component_names.index("SklearnIntentClassifier")
+    idx_classifier = _config.component_names.index("DIETClassifier")
 
     component1 = builder.create_component(
         _config.for_component(idx_featurizer), _config
@@ -154,7 +151,7 @@ def test_override_defaults_supervised_embeddings_pipeline():
     component2 = builder.create_component(
         _config.for_component(idx_classifier), _config
     )
-    assert component2.component_config["max_cross_validation_folds"] == 1
+    assert component2.component_config["epochs"] == 10
 
 
 def config_files_in(config_directory: Text):

From 765939fbe5b003a9e6702084c48b611b870cc3e0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 17:02:57 +0100
Subject: [PATCH 532/633] use jsonpickle instead of pickle

---
 rasa/core/policies/sklearn_policy.py          |  8 ++---
 rasa/core/policies/ted_policy.py              | 32 +++++++++---------
 rasa/nlu/classifiers/diet_classifier.py       | 33 ++++++++++++-------
 .../classifiers/sklearn_intent_classifier.py  | 10 +++---
 .../count_vectors_featurizer.py               |  7 ++--
 .../lexical_syntactic_featurizer.py           | 15 +++------
 rasa/nlu/utils/__init__.py                    | 21 ------------
 rasa/utils/io.py                              | 25 ++++++++++++--
 8 files changed, 77 insertions(+), 74 deletions(-)

diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py
index c25326a77802..fee324f4432d 100644
--- a/rasa/core/policies/sklearn_policy.py
+++ b/rasa/core/policies/sklearn_policy.py
@@ -176,8 +176,7 @@ def persist(self, path: Text) -> None:
             rasa.utils.io.dump_obj_as_json_to_file(meta_file, meta)
 
             filename = os.path.join(path, "sklearn_model.pkl")
-            with open(filename, "wb") as f:
-                pickle.dump(self._state, f)
+            rasa.utils.io.pickle_dump(filename, self._state)
         else:
             raise_warning(
                 "Persist called without a trained model present. "
@@ -201,10 +200,11 @@ def load(cls, path: Text) -> Policy:
 
         meta_file = os.path.join(path, "sklearn_policy.json")
         meta = json.loads(rasa.utils.io.read_file(meta_file))
+
         policy = cls(featurizer=featurizer, priority=meta["priority"])
 
-        with open(filename, "rb") as f:
-            state = pickle.load(f)
+        state = rasa.utils.io.pickle_load(filename)
+
         vars(policy).update(state)
 
         logger.info("Loaded sklearn model")
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 6c9d16bb0b1d..2997cda1e389 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -367,17 +367,17 @@ def persist(self, path: Text) -> None:
 
         self.model.save(str(tf_model_file))
 
-        io_utils.pickle_dump(
-            model_path / SAVE_MODEL_FILE_NAME + ".priority.json", self.priority
+        io_utils.json_pickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.priority.json", self.priority
         )
-        io_utils.pickle_dump(
-            model_path / SAVE_MODEL_FILE_NAME + ".meta.json", self.config
+        io_utils.json_pickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.meta.json", self.config
         )
-        io_utils.pickle_dump(
-            model_path / SAVE_MODEL_FILE_NAME + ".data_example.json", self.data_example
+        io_utils.json_pickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.json", self.data_example
         )
-        io_utils.pickle_dump(
-            model_path / SAVE_MODEL_FILE_NAME + ".label_data.json", self._label_data
+        io_utils.json_pickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.json", self._label_data
         )
 
     @classmethod
@@ -398,18 +398,18 @@ def load(cls, path: Text) -> "TEDPolicy":
 
         featurizer = TrackerFeaturizer.load(path)
 
-        if not (model_path / SAVE_MODEL_FILE_NAME + ".data_example.pkl").is_file():
+        if not (model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl").is_file():
             return cls(featurizer=featurizer)
 
-        loaded_data = io_utils.pickle_load(
-            model_path / SAVE_MODEL_FILE_NAME + ".data_example.pkl"
+        loaded_data = io_utils.json_unpickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl"
         )
-        label_data = io_utils.pickle_load(
-            model_path / SAVE_MODEL_FILE_NAME + ".label_data.pkl"
+        label_data = io_utils.json_unpickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl"
         )
-        meta = io_utils.pickle_load(model_path / SAVE_MODEL_FILE_NAME + ".meta.pkl")
-        priority = io_utils.pickle_load(
-            model_path / SAVE_MODEL_FILE_NAME + ".priority.pkl"
+        meta = io_utils.json_unpickle(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
+        priority = io_utils.json_unpickle(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
         )
 
         model_data_example = RasaModelData(label_key="label_ids", data=loaded_data)
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index db89d845b0ed..07225d9128b4 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -3,7 +3,6 @@
 
 import numpy as np
 import os
-import pickle
 import scipy.sparse
 import warnings
 import tensorflow as tf
@@ -797,14 +796,14 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         io_utils.pickle_dump(
             model_dir / f"{file_name}.data_example.pkl", self.data_example
         )
-        io_utils.pickle_dump(
+        io_utils.json_pickle(
             model_dir / f"{file_name}.label_data.pkl", self._label_data
         )
-        io_utils.pickle_dump(
-            model_dir / f"{file_name}.inv_label_dict.pkl", self.inverted_label_dict
+        io_utils.json_pickle(
+            model_dir / f"{file_name}.inverted_label_dict.pkl", self.inverted_label_dict
         )
-        io_utils.pickle_dump(
-            model_dir / f"{file_name}.inv_tag_dict.pkl", self.inverted_tag_dict
+        io_utils.json_pickle(
+            model_dir / f"{file_name}.inverted_tag_dict.pkl", self.inverted_tag_dict
         )
         io_utils.pickle_dump(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl", self.batch_tuple_sizes
@@ -858,19 +857,29 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         model_dir = Path(model_dir)
 
         data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl")
-        label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl")
-        inv_label_dict = io_utils.pickle_load(
-            model_dir / f"{file_name}.inv_label_dict.pkl"
+        label_data = io_utils.json_unpickle(model_dir / f"{file_name}.label_data.pkl")
+        inverted_label_dict = io_utils.json_unpickle(
+            model_dir / f"{file_name}.inverted_label_dict.pkl"
+        )
+        inverted_tag_dict = io_utils.json_unpickle(
+            model_dir / f"{file_name}.inverted_tag_dict.pkl"
         )
-        inv_tag_dict = io_utils.pickle_load(model_dir / f"{file_name}.inv_tag_dict.pkl")
         batch_tuple_sizes = io_utils.pickle_load(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl"
         )
 
+        # jsonpickle converts dictionary keys to strings
+        inverted_label_dict = {
+            int(key): value for key, value in inverted_label_dict.items()
+        }
+        inverted_tag_dict = {
+            int(key): value for key, value in inverted_tag_dict.items()
+        }
+
         return (
             batch_tuple_sizes,
-            inv_label_dict,
-            inv_tag_dict,
+            inverted_label_dict,
+            inverted_tag_dict,
             label_data,
             meta,
             data_example,
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index a2b6bb161834..0cdbe0d94949 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -6,8 +6,8 @@
 
 import numpy as np
 
+import rasa.utils.io as io_utils
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
-from rasa.nlu import utils
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.components import Component
 from rasa.nlu.config import RasaNLUModelConfig
@@ -229,10 +229,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         classifier_file_name = file_name + "_classifier.pkl"
         encoder_file_name = file_name + "_encoder.pkl"
         if self.clf and self.le:
-            utils.json_pickle(
+            io_utils.json_pickle(
                 os.path.join(model_dir, encoder_file_name), self.le.classes_
             )
-            utils.json_pickle(
+            io_utils.json_pickle(
                 os.path.join(model_dir, classifier_file_name), self.clf.best_estimator_
             )
         return {"classifier": classifier_file_name, "encoder": encoder_file_name}
@@ -252,8 +252,8 @@ def load(
         encoder_file = os.path.join(model_dir, meta.get("encoder"))
 
         if os.path.exists(classifier_file):
-            classifier = utils.json_unpickle(classifier_file)
-            classes = utils.json_unpickle(encoder_file)
+            classifier = io_utils.json_unpickle(classifier_file)
+            classes = io_utils.json_unpickle(encoder_file)
             encoder = LabelEncoder()
             encoder.classes_ = classes
             return cls(meta, classifier, encoder)
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 8e4cc21baa02..01d722ad61b8 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -6,9 +6,8 @@
 
 from rasa.constants import DOCS_URL_COMPONENTS
 import rasa.utils.common as common_utils
-
+import rasa.utils.io as io_utils
 from sklearn.feature_extraction.text import CountVectorizer
-from rasa.nlu import utils
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.model import Metadata
@@ -572,7 +571,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
                 else:
                     vocab = attribute_vocabularies
 
-                utils.json_pickle(featurizer_file, vocab)
+                io_utils.json_pickle(featurizer_file, vocab)
 
         return {"file": file_name}
 
@@ -646,7 +645,7 @@ def load(
         if not os.path.exists(featurizer_file):
             return cls(meta)
 
-        vocabulary = utils.json_unpickle(featurizer_file)
+        vocabulary = io_utils.json_unpickle(featurizer_file)
 
         share_vocabulary = meta["use_shared_vocab"]
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index aad615727f98..d784f863b8ad 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -1,9 +1,8 @@
-import json
 import logging
 from collections import defaultdict, OrderedDict
+from pathlib import Path
 
 import numpy as np
-import os
 import scipy.sparse
 from typing import Any, Dict, Optional, Text, List
 
@@ -269,10 +268,8 @@ def load(
 
         file_name = meta.get("file")
 
-        with open(
-            os.path.join(model_dir, file_name + ".feature_to_idx_dict.pkl"), "rb"
-        ) as f:
-            feature_to_idx_dict = json.load(f)
+        feature_to_idx_file = Path(model_dir) / f"{file_name}.feature_to_idx_dict.pkl"
+        feature_to_idx_dict = io_utils.json_unpickle(feature_to_idx_file)
 
         return LexicalSyntacticFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict)
 
@@ -280,9 +277,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         """Persist this model into the passed directory.
         Return the metadata necessary to load the model again."""
 
-        feature_to_idx_file = os.path.join(
-            model_dir, file_name + ".feature_to_idx_dict.pkl"
-        )
-        io_utils.dump_obj_as_json_to_file(feature_to_idx_file, self.feature_to_idx_dict)
+        feature_to_idx_file = Path(model_dir) / f"{file_name}.feature_to_idx_dict.pkl"
+        io_utils.json_pickle(feature_to_idx_file, self.feature_to_idx_dict)
 
         return {"file": file_name}
diff --git a/rasa/nlu/utils/__init__.py b/rasa/nlu/utils/__init__.py
index 21dbba149f48..528f990cc09a 100644
--- a/rasa/nlu/utils/__init__.py
+++ b/rasa/nlu/utils/__init__.py
@@ -104,24 +104,3 @@ def remove_model(model_dir: Text) -> bool:
             "Cannot remove {}, it seems it is not a model "
             "directory".format(model_dir)
         )
-
-
-def json_unpickle(file_name: Text) -> Any:
-    """Unpickle an object from file using json."""
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    file_content = io_utils.read_file(file_name)
-    return jsonpickle.loads(file_content)
-
-
-def json_pickle(file_name: Text, obj: Any) -> None:
-    """Pickle an object to a file using json."""
-    import jsonpickle.ext.numpy as jsonpickle_numpy
-    import jsonpickle
-
-    jsonpickle_numpy.register_handlers()
-
-    io_utils.write_text_file(jsonpickle.dumps(obj), file_name)
diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index 16ce344380ac..dc820c046ad9 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -158,13 +158,13 @@ def dump_obj_as_json_to_file(filename: Text, obj: Any) -> None:
     write_text_file(json.dumps(obj, indent=2), filename)
 
 
-def pickle_dump(filename: Text, obj: Any):
+def pickle_dump(filename: Union[Text, Path], obj: Any):
     """Saves object to file."""
     with open(filename, "wb") as f:
         pickle.dump(obj, f)
 
 
-def pickle_load(filename: Text) -> Any:
+def pickle_load(filename: Union[Text, Path]) -> Any:
     """Loads an object from a file."""
     with open(filename, "rb") as f:
         return pickle.load(f)
@@ -407,3 +407,24 @@ def zip_folder(folder: Text) -> Text:
 
     # WARN: not thread-safe!
     return shutil.make_archive(zipped_path.name, "zip", folder)
+
+
+def json_unpickle(file_name: Union[Text, Path]) -> Any:
+    """Unpickle an object from file using json."""
+    import jsonpickle.ext.numpy as jsonpickle_numpy
+    import jsonpickle
+
+    jsonpickle_numpy.register_handlers()
+
+    file_content = read_file(file_name)
+    return jsonpickle.loads(file_content)
+
+
+def json_pickle(file_name: Union[Text, Path], obj: Any) -> None:
+    """Pickle an object to a file using json."""
+    import jsonpickle.ext.numpy as jsonpickle_numpy
+    import jsonpickle
+
+    jsonpickle_numpy.register_handlers()
+
+    write_text_file(jsonpickle.dumps(obj), file_name)

From 8eb283f1e8b0e84143b5fefa0e6f9615944d72f4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 17:30:26 +0100
Subject: [PATCH 533/633] fix types

---
 rasa/utils/io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index dc820c046ad9..4094d27805ca 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -280,7 +280,7 @@ def create_path(file_path: Text) -> None:
         os.makedirs(parent_dir)
 
 
-def create_directory_for_file(file_path: Text) -> None:
+def create_directory_for_file(file_path: Union[Text, Path]) -> None:
     """Creates any missing parent directories of this file path."""
 
     create_directory(os.path.dirname(file_path))

From d5ac30e6c9fb67fca5aa356d9e34ce18a9206b56 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 21 Feb 2020 17:36:07 +0100
Subject: [PATCH 534/633] print warning on epochs not set.

---
 rasa/nlu/classifiers/diet_classifier.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 07225d9128b4..711508569dc7 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -248,6 +248,12 @@ def __init__(
     ) -> None:
         """Declare instance variables with default values."""
 
+        if component_config is not None and EPOCHS not in component_config:
+            logger.warning(
+                f"Please configure the number of '{EPOCHS}' in your configuration file."
+                f" We will change the default value of '{EPOCHS}' in the future to 1. "
+            )
+
         super().__init__(component_config)
 
         self._check_config_parameters()

From b45e1f4b89a684920b296fd9226d886345ae9c4a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 21 Feb 2020 17:54:21 +0100
Subject: [PATCH 535/633] deprecate provides and requires in nlu

---
 changelog/5266.misc.rst                       |   3 +
 docs/api/custom-nlu-components.rst            |   2 +
 rasa/nlu/classifiers/classifier.py            |   5 +
 rasa/nlu/classifiers/diet_classifier.py       |  13 +-
 .../embedding_intent_classifier.py            |  12 +-
 .../classifiers/keyword_intent_classifier.py  |   7 +-
 .../classifiers/mitie_intent_classifier.py    |  13 +-
 .../classifiers/sklearn_intent_classifier.py  |  11 +-
 rasa/nlu/components.py                        | 132 +++++++-----------
 rasa/nlu/extractors/__init__.py               |  90 ------------
 rasa/nlu/extractors/crf_entity_extractor.py   |  10 +-
 .../nlu/extractors/duckling_http_extractor.py |   4 +-
 rasa/nlu/extractors/entity_synonyms.py        |   5 +-
 rasa/nlu/extractors/extractor.py              |  90 ++++++++++++
 rasa/nlu/extractors/mitie_entity_extractor.py |  12 +-
 rasa/nlu/extractors/spacy_entity_extractor.py |  10 +-
 .../dense_featurizer/convert_featurizer.py    |  14 +-
 .../dense_featurizer/lm_featurizer.py         |  20 +--
 .../dense_featurizer/mitie_featurizer.py      |  25 ++--
 .../dense_featurizer/spacy_featurizer.py      |  25 ++--
 rasa/nlu/featurizers/featurizer.py            |  48 ++++---
 .../count_vectors_featurizer.py               |  11 +-
 .../lexical_syntactic_featurizer.py           |  12 +-
 .../sparse_featurizer/ngram_featurizer.py     |   4 +-
 .../sparse_featurizer/regex_featurizer.py     |  12 +-
 rasa/nlu/model.py                             |   1 -
 rasa/nlu/selectors/response_selector.py       |  15 +-
 rasa/nlu/tokenizers/convert_tokenizer.py      |   8 +-
 rasa/nlu/tokenizers/jieba_tokenizer.py        |   2 -
 rasa/nlu/tokenizers/lm_tokenizer.py           |  10 +-
 rasa/nlu/tokenizers/mitie_tokenizer.py        |   2 -
 rasa/nlu/tokenizers/spacy_tokenizer.py        |   9 +-
 rasa/nlu/tokenizers/whitespace_tokenizer.py   |   2 -
 .../nlu/utils/hugging_face/hf_transformers.py |   4 -
 rasa/nlu/utils/mitie_utils.py                 |   2 -
 rasa/nlu/utils/spacy_utils.py                 |   3 -
 rasa/utils/tensorflow/transformer.py          |   3 +-
 tests/nlu/example_component.py                |  40 +++---
 tests/nlu/test_config.py                      |   5 +-
 39 files changed, 305 insertions(+), 391 deletions(-)
 create mode 100644 rasa/nlu/classifiers/classifier.py
 create mode 100644 rasa/nlu/extractors/extractor.py

diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst
index 0fc0fd323290..70c4b8221986 100644
--- a/changelog/5266.misc.rst
+++ b/changelog/5266.misc.rst
@@ -7,3 +7,6 @@ include in your pipeline.
 Use ``DIETClassifier`` instead of ``SklearnIntentClassifier``.
 
 Use ``TEDPolicy`` instead of ``KerasPolicy``.
+
+Properties ``Component.provides`` and ``Component.requires`` are deprecated.
+Use ``Component.required_components()`` instead.
\ No newline at end of file
diff --git a/docs/api/custom-nlu-components.rst b/docs/api/custom-nlu-components.rst
index 733e65d31f45..f41f61b7ba15 100644
--- a/docs/api/custom-nlu-components.rst
+++ b/docs/api/custom-nlu-components.rst
@@ -51,6 +51,8 @@ Component
 
 .. autoclass:: rasa.nlu.components.Component
 
+   .. automethod:: required_components
+
    .. automethod:: required_packages
 
    .. automethod:: create
diff --git a/rasa/nlu/classifiers/classifier.py b/rasa/nlu/classifiers/classifier.py
new file mode 100644
index 000000000000..ee9d5cc73373
--- /dev/null
+++ b/rasa/nlu/classifiers/classifier.py
@@ -0,0 +1,5 @@
+from rasa.nlu.components import Component
+
+
+class IntentClassifier(Component):
+    pass
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 5bf81276a9ab..1ef34df74468 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -12,11 +12,12 @@
 
 import rasa.utils.io
 import rasa.nlu.utils.bilou_utils as bilou_utils
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.classifiers.classifier import IntentClassifier
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.test import determine_token_labels
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
-from rasa.nlu.components import any_of
 from rasa.utils import train_utils
 from rasa.utils.tensorflow import layers
 from rasa.utils.tensorflow.transformer import TransformerEncoder
@@ -78,7 +79,7 @@
 logger = logging.getLogger(__name__)
 
 
-class DIETClassifier(EntityExtractor):
+class DIETClassifier(IntentClassifier, EntityExtractor):
     """DIET (Dual Intent and Entity Transformer) is a multi-task architecture for
     intent classification and entity recognition.
 
@@ -91,9 +92,9 @@ class DIETClassifier(EntityExtractor):
     similarities with negative samples.
     """
 
-    provides = ["intent", "intent_ranking", "entities"]
-
-    requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Featurizer]
 
     # please make sure to update the docs when changing a default parameter
     defaults = {
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 75d36785e8f7..31172a62194a 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,9 +1,9 @@
 import logging
-from typing import Any, Dict, Optional, Text
+from typing import Any, Dict, Optional, Text, List
 
-from rasa.nlu.components import any_of
+from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
-from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES
+from rasa.nlu.constants import TEXT
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -54,9 +54,9 @@ class EmbeddingIntentClassifier(DIETClassifier):
     This algorithm also provides similarity rankings of the labels that did not "win".
     """
 
-    provides = ["intent", "intent_ranking"]
-
-    requires = [any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT])]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Featurizer]
 
     # please make sure to update the docs when changing a default parameter
     defaults = {
diff --git a/rasa/nlu/classifiers/keyword_intent_classifier.py b/rasa/nlu/classifiers/keyword_intent_classifier.py
index ab5d5ddba15e..adebe6217243 100644
--- a/rasa/nlu/classifiers/keyword_intent_classifier.py
+++ b/rasa/nlu/classifiers/keyword_intent_classifier.py
@@ -1,12 +1,11 @@
 import os
 import logging
-import typing
 import re
 from typing import Any, Dict, Optional, Text
 
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu import utils
-from rasa.nlu.components import Component
+from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.constants import INTENT
 from rasa.utils.common import raise_warning
 from rasa.nlu.config import RasaNLUModelConfig
@@ -17,7 +16,7 @@
 logger = logging.getLogger(__name__)
 
 
-class KeywordIntentClassifier(Component):
+class KeywordIntentClassifier(IntentClassifier):
     """Intent classifier using simple keyword matching.
 
 
@@ -26,8 +25,6 @@ class KeywordIntentClassifier(Component):
 
     """
 
-    provides = [INTENT]
-
     defaults = {"case_sensitive": True}
 
     def __init__(
diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py
index dd05a58f9c77..ba3f7e727aa6 100644
--- a/rasa/nlu/classifiers/mitie_intent_classifier.py
+++ b/rasa/nlu/classifiers/mitie_intent_classifier.py
@@ -2,7 +2,9 @@
 import typing
 from typing import Any, Dict, List, Optional, Text
 
-from rasa.nlu.components import Component
+from rasa.nlu.utils.mitie_utils import MitieNLP
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Metadata
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT
@@ -12,11 +14,10 @@
     import mitie
 
 
-class MitieIntentClassifier(Component):
-
-    provides = [INTENT]
-
-    requires = [TOKENS_NAMES[TEXT], "mitie_feature_extractor", "mitie_file"]
+class MitieIntentClassifier(IntentClassifier):
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [MitieNLP, Tokenizer]
 
     def __init__(
         self, component_config: Optional[Dict[Text, Any]] = None, clf=None
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index a2b6bb161834..da3ea17a290b 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -9,7 +9,8 @@
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu import utils
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
-from rasa.nlu.components import Component
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer
+from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT
 from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features
@@ -23,12 +24,12 @@
     import sklearn
 
 
-class SklearnIntentClassifier(Component):
+class SklearnIntentClassifier(IntentClassifier):
     """Intent classifier using the sklearn framework"""
 
-    provides = ["intent", "intent_ranking"]
-
-    requires = [DENSE_FEATURE_NAMES[TEXT]]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [DenseFeaturizer]
 
     defaults = {
         # C parameter of the svm - cross validation will select the best value
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index e80d9936914c..40767ab7c33d 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -3,7 +3,6 @@
 from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
 
 from rasa.nlu.config import RasaNLUModelConfig, override_defaults, InvalidConfigError
-from rasa.nlu.constants import RESPONSE
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
 
@@ -68,78 +67,51 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
         )
 
 
-def validate_required_components(pipeline: List["Component"]) -> None:
-    """Validates that all required components are present in the pipeline."""
-
-    unique_component_names = set()
-    for component in pipeline:
-        unique_component_names.add(component.name)
-        if not set(component.required_components).issubset(unique_component_names):
-            raise InvalidConfigError(
-                f"'{component.name}' requires {component.required_components}. "
-                f"Add required components to the pipeline."
-            )
-
-
-def validate_arguments(
-    pipeline: List["Component"],
-    context: Dict[Text, Any],
-    allow_empty_pipeline: bool = False,
-) -> None:
-    """Validates that all arguments are present to train the pipeline."""
-
-    # Ensure the pipeline is not empty
-    if not allow_empty_pipeline and len(pipeline) == 0:
-        raise InvalidConfigError(
-            "Can not train an empty pipeline. "
-            "Make sure to specify a proper pipeline in "
-            "the configuration using the 'pipeline' key. "
-            "The 'backend' configuration key is "
-            "NOT supported anymore."
-        )
-
-    provided_properties = set(context.keys())
-
-    for component in pipeline:
-        for r in component.requires:
-            if isinstance(r, Tuple):
-                validate_requires_any_of(r, provided_properties, str(component.name))
-            else:
-                if r not in provided_properties:
-                    raise InvalidConfigError(
-                        f"Failed to validate component {component.name}. "
-                        f"Missing property: '{r}'"
-                    )
-
-        provided_properties.update(component.provides)
-
+def _required_component_in_pipeline(
+    required_component: Any, pipeline: List["Component"]
+) -> bool:
+    """Checks that required component present in the pipeline."""
 
-def any_of(*args) -> Tuple[Any]:
-    """Helper function to define that one of the given arguments is required.
-
-    Should be used inside `requires`.
-    """
+    for previous_component in pipeline:
+        if isinstance(previous_component, required_component):
+            return True
+    return False
 
-    return args
 
+def validate_required_components(pipeline: List["Component"]) -> None:
+    """Validates that all required components are present in the pipeline."""
 
-def validate_requires_any_of(
-    required_properties: Tuple[Text],
-    provided_properties: Set[Text],
-    component_name: Text,
-) -> None:
-    """Validates that at least one of the given required properties is present."""
+    for i, component in enumerate(pipeline):
+        if hasattr(component, "provides"):
+            raise_warning(
+                f"'{component.name}' contains property 'provides', "
+                f"which is deprecated. There is no need to specify "
+                f"the list of attributes that a component provides.",
+                category=FutureWarning,
+                docs="https://rasa.com/docs/rasa/migration-guide/",
+            )
+        if hasattr(component, "requires"):
+            raise_warning(
+                f"'{component.name}' contains property 'requires', "
+                f"which is deprecated. Use 'required_components()' method "
+                f"to specify which components are required to be present "
+                f"in the pipeline by this component.",
+                category=FutureWarning,
+                docs="https://rasa.com/docs/rasa/migration-guide/",
+            )
 
-    property_present = any(
-        [property in provided_properties for property in required_properties]
-    )
+        missing_components = []
+        for required_component in component.required_components():
+            if not _required_component_in_pipeline(
+                required_component, pipeline[: i + 1]
+            ):
+                missing_components.append(required_component.name)
 
-    if not property_present:
-        raise InvalidConfigError(
-            f"Failed to validate component '{component_name}'. "
-            f"Missing one of the following properties: "
-            f"{required_properties}."
-        )
+        if missing_components:
+            raise InvalidConfigError(
+                f"'{component.name}' requires {missing_components}. "
+                f"Add required components to the pipeline."
+            )
 
 
 def validate_required_components_from_data(
@@ -147,10 +119,12 @@ def validate_required_components_from_data(
 ) -> None:
     """Validates that all components are present in the pipeline based on data."""
 
+    from rasa.nlu.selectors.response_selector import ResponseSelector
+
     response_selector_exists = False
     for component in pipeline:
         # check if a response selector is part of NLU pipeline
-        if RESPONSE in component.provides:
+        if isinstance(component, ResponseSelector):
             response_selector_exists = True
 
     if len(data.response_examples) and not response_selector_exists:
@@ -237,25 +211,13 @@ def name(self):
 
         return type(self).name
 
-    # Defines what attributes the pipeline component will
-    # provide when called. The listed attributes
-    # should be set by the component on the message object
-    # during test and train, e.g.
-    # ```message.set("entities", [...])```
-    provides = []
-
-    # Which attributes on a message are required by this
-    # component. E.g. if requires contains "tokens", than a
-    # previous component in the pipeline needs to have "tokens"
-    # within the above described `provides` property.
-    # Use `any_of("option_1", "option_2")` to define that either
-    # "option_1" or "option_2" needs to be present in the
-    # provided properties from the previous components.
-    requires = []
-
     # Which components are required by this component.
     # Listed components should appear before the component itself in the pipeline.
-    required_components = []
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        """Specify which components need to be present in the pipeline."""
+
+        return []
 
     # Defines the default configuration parameters of a component
     # these values can be overwritten in the pipeline configuration
diff --git a/rasa/nlu/extractors/__init__.py b/rasa/nlu/extractors/__init__.py
index bcdf16ffb366..e69de29bb2d1 100644
--- a/rasa/nlu/extractors/__init__.py
+++ b/rasa/nlu/extractors/__init__.py
@@ -1,90 +0,0 @@
-from typing import Any, Dict, List, Text, Tuple
-
-from rasa.nlu.components import Component
-from rasa.nlu.constants import EXTRACTOR, ENTITIES
-from rasa.nlu.training_data import Message
-
-
-class EntityExtractor(Component):
-    def add_extractor_name(
-        self, entities: List[Dict[Text, Any]]
-    ) -> List[Dict[Text, Any]]:
-        for entity in entities:
-            entity[EXTRACTOR] = self.name
-        return entities
-
-    def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:
-        if "processors" in entity:
-            entity["processors"].append(self.name)
-        else:
-            entity["processors"] = [self.name]
-
-        return entity
-
-    @staticmethod
-    def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list:
-        """Only return dimensions the user configured"""
-
-        if requested_dimensions:
-            return [
-                entity
-                for entity in extracted
-                if entity["entity"] in requested_dimensions
-            ]
-        else:
-            return extracted
-
-    @staticmethod
-    def find_entity(ent, text, tokens) -> Tuple[int, int]:
-        offsets = [token.start for token in tokens]
-        ends = [token.end for token in tokens]
-
-        if ent["start"] not in offsets:
-            message = (
-                "Invalid entity {} in example '{}': "
-                "entities must span whole tokens. "
-                "Wrong entity start.".format(ent, text)
-            )
-            raise ValueError(message)
-
-        if ent["end"] not in ends:
-            message = (
-                "Invalid entity {} in example '{}': "
-                "entities must span whole tokens. "
-                "Wrong entity end.".format(ent, text)
-            )
-            raise ValueError(message)
-
-        start = offsets.index(ent["start"])
-        end = ends.index(ent["end"]) + 1
-        return start, end
-
-    def filter_trainable_entities(
-        self, entity_examples: List[Message]
-    ) -> List[Message]:
-        """Filters out untrainable entity annotations.
-
-        Creates a copy of entity_examples in which entities that have
-        `extractor` set to something other than
-        self.name (e.g. 'CRFEntityExtractor') are removed.
-        """
-
-        filtered = []
-        for message in entity_examples:
-            entities = []
-            for ent in message.get(ENTITIES, []):
-                extractor = ent.get(EXTRACTOR)
-                if not extractor or extractor == self.name:
-                    entities.append(ent)
-            data = message.data.copy()
-            data[ENTITIES] = entities
-            filtered.append(
-                Message(
-                    text=message.text,
-                    data=data,
-                    output_properties=message.output_properties,
-                    time=message.time,
-                )
-            )
-
-        return filtered
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 48d23467e3a4..851dab60da78 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -7,7 +7,8 @@
 import rasa.nlu.utils.bilou_utils as bilou_utils
 import rasa.utils.common as common_utils
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message, TrainingData
@@ -29,10 +30,9 @@ class CRFToken(NamedTuple):
 
 
 class CRFEntityExtractor(EntityExtractor):
-
-    provides = [ENTITIES]
-
-    requires = [TOKENS_NAMES[TEXT]]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Tokenizer]
 
     defaults = {
         # BILOU_flag determines whether to use BILOU tagging or not.
diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py
index dbc3327286c2..a783faff7154 100644
--- a/rasa/nlu/extractors/duckling_http_extractor.py
+++ b/rasa/nlu/extractors/duckling_http_extractor.py
@@ -8,7 +8,7 @@
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.constants import ENTITIES
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
 from rasa.utils.common import raise_warning
@@ -53,8 +53,6 @@ def convert_duckling_format_to_rasa(
 class DucklingHTTPExtractor(EntityExtractor):
     """Searches for structured entites, e.g. dates, using a duckling server."""
 
-    provides = [ENTITIES]
-
     defaults = {
         # by default all dimensions recognized by duckling are returned
         # dimensions can be configured to contain an array of strings
diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py
index 88ce0f94cf37..5d30d7d16d95 100644
--- a/rasa/nlu/extractors/entity_synonyms.py
+++ b/rasa/nlu/extractors/entity_synonyms.py
@@ -4,7 +4,7 @@
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu.constants import ENTITIES
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.utils import write_json_to_file
@@ -13,9 +13,6 @@
 
 
 class EntitySynonymMapper(EntityExtractor):
-
-    provides = [ENTITIES]
-
     def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py
new file mode 100644
index 000000000000..bcdf16ffb366
--- /dev/null
+++ b/rasa/nlu/extractors/extractor.py
@@ -0,0 +1,90 @@
+from typing import Any, Dict, List, Text, Tuple
+
+from rasa.nlu.components import Component
+from rasa.nlu.constants import EXTRACTOR, ENTITIES
+from rasa.nlu.training_data import Message
+
+
+class EntityExtractor(Component):
+    def add_extractor_name(
+        self, entities: List[Dict[Text, Any]]
+    ) -> List[Dict[Text, Any]]:
+        for entity in entities:
+            entity[EXTRACTOR] = self.name
+        return entities
+
+    def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:
+        if "processors" in entity:
+            entity["processors"].append(self.name)
+        else:
+            entity["processors"] = [self.name]
+
+        return entity
+
+    @staticmethod
+    def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list:
+        """Only return dimensions the user configured"""
+
+        if requested_dimensions:
+            return [
+                entity
+                for entity in extracted
+                if entity["entity"] in requested_dimensions
+            ]
+        else:
+            return extracted
+
+    @staticmethod
+    def find_entity(ent, text, tokens) -> Tuple[int, int]:
+        offsets = [token.start for token in tokens]
+        ends = [token.end for token in tokens]
+
+        if ent["start"] not in offsets:
+            message = (
+                "Invalid entity {} in example '{}': "
+                "entities must span whole tokens. "
+                "Wrong entity start.".format(ent, text)
+            )
+            raise ValueError(message)
+
+        if ent["end"] not in ends:
+            message = (
+                "Invalid entity {} in example '{}': "
+                "entities must span whole tokens. "
+                "Wrong entity end.".format(ent, text)
+            )
+            raise ValueError(message)
+
+        start = offsets.index(ent["start"])
+        end = ends.index(ent["end"]) + 1
+        return start, end
+
+    def filter_trainable_entities(
+        self, entity_examples: List[Message]
+    ) -> List[Message]:
+        """Filters out untrainable entity annotations.
+
+        Creates a copy of entity_examples in which entities that have
+        `extractor` set to something other than
+        self.name (e.g. 'CRFEntityExtractor') are removed.
+        """
+
+        filtered = []
+        for message in entity_examples:
+            entities = []
+            for ent in message.get(ENTITIES, []):
+                extractor = ent.get(EXTRACTOR)
+                if not extractor or extractor == self.name:
+                    entities.append(ent)
+            data = message.data.copy()
+            data[ENTITIES] = entities
+            filtered.append(
+                Message(
+                    text=message.text,
+                    data=data,
+                    output_properties=message.output_properties,
+                    time=message.time,
+                )
+            )
+
+        return filtered
diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py
index 561bca7d74e1..b78bcdad79cd 100644
--- a/rasa/nlu/extractors/mitie_entity_extractor.py
+++ b/rasa/nlu/extractors/mitie_entity_extractor.py
@@ -5,9 +5,10 @@
 
 from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.utils.mitie_utils import MitieNLP
+from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
-from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
 
@@ -18,10 +19,9 @@
 
 
 class MitieEntityExtractor(EntityExtractor):
-
-    provides = [ENTITIES]
-
-    requires = [TOKENS_NAMES[TEXT], "mitie_feature_extractor", "mitie_file"]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [MitieNLP, Tokenizer]
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None, ner=None):
         """Construct a new intent classifier using the sklearn framework."""
diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py
index 2a90c0bd6f63..53be4f389eaa 100644
--- a/rasa/nlu/extractors/spacy_entity_extractor.py
+++ b/rasa/nlu/extractors/spacy_entity_extractor.py
@@ -2,7 +2,8 @@
 from typing import Any, Dict, List, Text, Optional
 
 from rasa.nlu.constants import ENTITIES
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.utils.spacy_utils import SpacyNLP
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.training_data import Message
 
 if typing.TYPE_CHECKING:
@@ -10,10 +11,9 @@
 
 
 class SpacyEntityExtractor(EntityExtractor):
-
-    provides = [ENTITIES]
-
-    requires = ["spacy_nlp"]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [SpacyNLP]
 
     defaults = {
         # by default all dimensions recognized by spacy are returned
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index a497c91a6a61..65a75ad0606a 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -4,7 +4,7 @@
 
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
@@ -23,7 +23,7 @@
 logger = logging.getLogger(__name__)
 
 
-class ConveRTFeaturizer(Featurizer):
+class ConveRTFeaturizer(DenseFeaturizer):
     """Featurizer using ConveRT model.
 
         Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
@@ -31,13 +31,9 @@ class ConveRTFeaturizer(Featurizer):
         for dense featurizable attributes of each message object.
     """
 
-    provides = [
-        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
-
-    required_components = [ConveRTTokenizer.name]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [ConveRTTokenizer]
 
     def _load_from_tfhub(self, model_url: Text):
         """Load model from TFHub"""
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index ce8d76073750..234d7d5d1dbb 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -1,39 +1,31 @@
 import numpy as np
-from typing import Any, Optional, Text
+from typing import Any, Optional, Text, List
 
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.training_data import Message, TrainingData
-
 from rasa.nlu.constants import (
     TEXT,
     LANGUAGE_MODEL_DOCS,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
-    TOKENS_NAMES,
     SEQUENCE_FEATURES,
     SENTENCE_FEATURES,
 )
 
 
-class LanguageModelFeaturizer(Featurizer):
+class LanguageModelFeaturizer(DenseFeaturizer):
     """Featurizer using transformer based language models.
 
         Uses the output of HFTransformersNLP component to set the sequence and sentence
         level representations for dense featurizable attributes of each message object.
     """
 
-    provides = [
-        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    requires = [
-        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
-
-    required_components = [HFTransformersNLP.name, LanguageModelTokenizer.name]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [HFTransformersNLP, LanguageModelTokenizer]
 
     def train(
         self,
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 1bbab2d66af9..2f743ee1b8eb 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -3,15 +3,10 @@
 from typing import Any, List, Text, Optional, Dict
 
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.featurizer import Featurizer
-from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer
+from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
-from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.training_data import Message, TrainingData
-
-if typing.TYPE_CHECKING:
-    import mitie
-
 from rasa.nlu.constants import (
     TEXT,
     TOKENS_NAMES,
@@ -19,18 +14,14 @@
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
 
+if typing.TYPE_CHECKING:
+    import mitie
 
-class MitieFeaturizer(Featurizer):
-
-    provides = [
-        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    requires = [
-        TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ] + ["mitie_feature_extractor"]
 
-    required_components = [MitieNLP.name]
+class MitieFeaturizer(DenseFeaturizer):
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [MitieNLP, Tokenizer]
 
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 378e715dca9c..0f2ca6ade5e3 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -1,36 +1,27 @@
 import numpy as np
 import typing
-from typing import Any, Optional, Text, Dict
+from typing import Any, Optional, Text, Dict, List
 
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.training_data import Message, TrainingData
-
-if typing.TYPE_CHECKING:
-    from spacy.tokens import Doc
-
 from rasa.nlu.constants import (
     TEXT,
     SPACY_DOCS,
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
-    TOKENS_NAMES,
 )
 
+if typing.TYPE_CHECKING:
+    from spacy.tokens import Doc
 
-class SpacyFeaturizer(Featurizer):
-
-    provides = [
-        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    requires = [
-        SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
 
-    required_components = [SpacyNLP.name, SpacyTokenizer.name]
+class SpacyFeaturizer(DenseFeaturizer):
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [SpacyNLP, SpacyTokenizer]
 
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index f4a1e9b474eb..2c814e4cc9a0 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -24,6 +24,10 @@ def sequence_to_sentence_features(
 
 
 class Featurizer(Component):
+    pass
+
+
+class DenseFeaturizer(Featurizer):
     @staticmethod
     def _combine_with_existing_dense_features(
         message: Message,
@@ -45,6 +49,29 @@ def _combine_with_existing_dense_features(
         else:
             return additional_features
 
+    @staticmethod
+    def _calculate_cls_vector(
+        features: np.ndarray, pooling_operation: Text
+    ) -> np.ndarray:
+        # take only non zeros feature vectors into account
+        non_zero_features = np.array([f for f in features if f.any()])
+
+        # if features are all zero just return a vector with all zeros
+        if non_zero_features.size == 0:
+            return np.zeros([1, features.shape[-1]])
+
+        if pooling_operation == "mean":
+            return np.mean(non_zero_features, axis=0, keepdims=True)
+        elif pooling_operation == "max":
+            return np.max(non_zero_features, axis=0, keepdims=True)
+        else:
+            raise ValueError(
+                f"Invalid pooling operation specified. Available operations are "
+                f"'mean' or 'max', but provided value is '{pooling_operation}'."
+            )
+
+
+class SparseFeaturizer(Featurizer):
     @staticmethod
     def _combine_with_existing_sparse_features(
         message: Message,
@@ -66,24 +93,3 @@ def _combine_with_existing_sparse_features(
             return hstack([message.get(feature_name), additional_features])
         else:
             return additional_features
-
-    @staticmethod
-    def _calculate_cls_vector(
-        features: np.ndarray, pooling_operation: Text
-    ) -> np.ndarray:
-        # take only non zeros feature vectors into account
-        non_zero_features = np.array([f for f in features if f.any()])
-
-        # if features are all zero just return a vector with all zeros
-        if non_zero_features.size == 0:
-            return np.zeros([1, features.shape[-1]])
-
-        if pooling_operation == "mean":
-            return np.mean(non_zero_features, axis=0, keepdims=True)
-        elif pooling_operation == "max":
-            return np.max(non_zero_features, axis=0, keepdims=True)
-        else:
-            raise ValueError(
-                f"Invalid pooling operation specified. Available operations are "
-                f"'mean' or 'max', but provided value is '{pooling_operation}'."
-            )
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 8e4cc21baa02..325e8876f185 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -10,7 +10,8 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from rasa.nlu import utils
 from rasa.nlu.config import RasaNLUModelConfig
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import (
@@ -26,7 +27,7 @@
 logger = logging.getLogger(__name__)
 
 
-class CountVectorsFeaturizer(Featurizer):
+class CountVectorsFeaturizer(SparseFeaturizer):
     """Creates a sequence of token counts features based on sklearn's `CountVectorizer`.
 
     All tokens which consist only of digits (e.g. 123 and 99
@@ -37,9 +38,9 @@ class CountVectorsFeaturizer(Featurizer):
     from https://arxiv.org/abs/1810.07150.
     """
 
-    provides = [SPARSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
-    requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Tokenizer]
 
     defaults = {
         # whether to use a shared vocab
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index aad615727f98..8f176e75dfd5 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -9,7 +9,8 @@
 
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES
@@ -19,11 +20,10 @@
 logger = logging.getLogger(__name__)
 
 
-class LexicalSyntacticFeaturizer(Featurizer):
-
-    provides = [SPARSE_FEATURE_NAMES[TEXT]]
-
-    requires = [TOKENS_NAMES[TEXT]]
+class LexicalSyntacticFeaturizer(SparseFeaturizer):
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Tokenizer]
 
     defaults = {
         # 'features' is [before, word, after] array with before, word,
diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py
index 60badfb41dc1..5c80f9d803a8 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py
@@ -2,12 +2,12 @@
 
 from typing import Any, Dict, Optional, Text
 
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 
 logger = logging.getLogger(__name__)
 
 
-class NGramFeaturizer(Featurizer):
+class NGramFeaturizer(SparseFeaturizer):
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super(NGramFeaturizer, self).__init__(component_config)
 
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index 2709008bad00..9bec70681f52 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -18,7 +18,8 @@
     TEXT,
     TOKENS_NAMES,
 )
-from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 from rasa.nlu.training_data import Message, TrainingData
 import rasa.utils.common as common_utils
 from rasa.nlu.model import Metadata
@@ -26,11 +27,10 @@
 logger = logging.getLogger(__name__)
 
 
-class RegexFeaturizer(Featurizer):
-
-    provides = [SPARSE_FEATURE_NAMES[TEXT]]
-
-    requires = [TOKENS_NAMES[TEXT]]
+class RegexFeaturizer(SparseFeaturizer):
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Tokenizer]
 
     def __init__(
         self,
diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py
index 185ccc084baf..008078426776 100644
--- a/rasa/nlu/model.py
+++ b/rasa/nlu/model.py
@@ -179,7 +179,6 @@ def train(self, data: TrainingData, **kwargs: Any) -> "Interpreter":
 
         # Before the training starts: check that all arguments are provided
         if not self.skip_validation:
-            components.validate_arguments(self.pipeline, context)
             components.validate_required_components_from_data(
                 self.pipeline, self.training_data
             )
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 49c4710a6967..7d4593ab05c7 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -3,11 +3,11 @@
 import numpy as np
 import tensorflow as tf
 
-from typing import Any, Dict, Optional, Text, Tuple, Union
+from typing import Any, Dict, Optional, Text, Tuple, Union, List
 
 from rasa.nlu.training_data import TrainingData, Message
+from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
-from rasa.nlu.components import any_of
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -52,9 +52,7 @@
     RESPONSE,
     RESPONSE_SELECTOR_PROPERTY_NAME,
     DEFAULT_OPEN_UTTERANCE_TYPE,
-    DENSE_FEATURE_NAMES,
     TEXT,
-    SPARSE_FEATURE_NAMES,
 )
 from rasa.utils.tensorflow.model_data import RasaModelData
 from rasa.utils.tensorflow.models import RasaModel
@@ -81,12 +79,9 @@ class ResponseSelector(DIETClassifier):
     and additional hidden layers are added together with dropout.
     """
 
-    provides = [RESPONSE, "response_ranking"]
-
-    requires = [
-        any_of(DENSE_FEATURE_NAMES[TEXT], SPARSE_FEATURE_NAMES[TEXT]),
-        any_of(DENSE_FEATURE_NAMES[RESPONSE], SPARSE_FEATURE_NAMES[RESPONSE]),
-    ]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [Featurizer]
 
     defaults = {
         # ## Architecture of the used neural network
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index 383317b33b96..d83255c9cbb6 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -11,13 +11,11 @@
 class ConveRTTokenizer(WhitespaceTokenizer):
     """Tokenizer using ConveRT model.
 
-        Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
-        model from TFHub and computes sub-word tokens for dense
-        featurizable attributes of each message object.
+    Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
+    model from TFHub and computes sub-word tokens for dense
+    featurizable attributes of each message object.
     """
 
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
     defaults = {
         # Flag to check whether to split intents
         "intent_tokenization_flag": False,
diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py
index 6cf2af11f45e..59dd9425a404 100644
--- a/rasa/nlu/tokenizers/jieba_tokenizer.py
+++ b/rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -20,8 +20,6 @@
 
 class JiebaTokenizer(Tokenizer):
 
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
     language_list = ["zh"]
 
     defaults = {
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index d6210776ce8c..0c6ca9ee215e 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -20,13 +20,9 @@ class LanguageModelTokenizer(Tokenizer):
     for dense featurizable attributes of each message object.
     """
 
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
-    requires = [
-        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
-    required_components = [HFTransformersNLP.name]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [HFTransformersNLP]
 
     defaults = {
         # Flag to check whether to split intents
diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py
index 1a9e4d34c980..054e3225fb10 100644
--- a/rasa/nlu/tokenizers/mitie_tokenizer.py
+++ b/rasa/nlu/tokenizers/mitie_tokenizer.py
@@ -9,8 +9,6 @@
 
 class MitieTokenizer(Tokenizer):
 
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
     defaults = {
         # Flag to check whether to split intents
         "intent_tokenization_flag": False,
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 08ef77cfbdfa..65a8788abb20 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -17,12 +17,9 @@
 
 
 class SpacyTokenizer(Tokenizer):
-
-    provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
-
-    requires = [SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
-
-    required_components = [SpacyNLP.name]
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        return [SpacyNLP]
 
     defaults = {
         # Flag to check whether to split intents
diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py
index 4b7c7253d2bd..85ad4d07bf0d 100644
--- a/rasa/nlu/tokenizers/whitespace_tokenizer.py
+++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -8,8 +8,6 @@
 
 class WhitespaceTokenizer(Tokenizer):
 
-    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
-
     defaults = {
         # Flag to check whether to split intents
         "intent_tokenization_flag": False,
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index a26955515cd3..d0522fc63f43 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -31,10 +31,6 @@ class HFTransformersNLP(Component):
     message.
     """
 
-    provides = [
-        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
-
     defaults = {
         # name of the language model to load.
         "model_name": "bert",
diff --git a/rasa/nlu/utils/mitie_utils.py b/rasa/nlu/utils/mitie_utils.py
index 2dfaa0202d72..91d37cc392d7 100644
--- a/rasa/nlu/utils/mitie_utils.py
+++ b/rasa/nlu/utils/mitie_utils.py
@@ -12,8 +12,6 @@
 
 class MitieNLP(Component):
 
-    provides = ["mitie_feature_extractor", "mitie_file"]
-
     defaults = {
         # name of the language model to load - this contains
         # the MITIE feature extractor
diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py
index 25fd7e0e9f24..3eae015409d1 100644
--- a/rasa/nlu/utils/spacy_utils.py
+++ b/rasa/nlu/utils/spacy_utils.py
@@ -18,9 +18,6 @@
 
 
 class SpacyNLP(Component):
-    provides = ["spacy_nlp"] + [
-        SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
-    ]
 
     defaults = {
         # name of the language model to load - if it is not set
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 8398821363bd..9903e7ce5688 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -128,7 +128,8 @@ def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor:
         x.shape =
         (batch, num_heads, length, relative_length, depth)
         or (batch, num_heads, length, relative_length)
-        "Slides" relative embeddings by 45 degree """
+        "Slides" relative embeddings by 45 degree.
+        """
 
         x_dim = len(x.shape)
 
diff --git a/tests/nlu/example_component.py b/tests/nlu/example_component.py
index fea264ca9996..3fb3a6a4e40f 100644
--- a/tests/nlu/example_component.py
+++ b/tests/nlu/example_component.py
@@ -1,6 +1,9 @@
-from rasa.nlu.components import Component
 import typing
-from typing import Any, Optional, Text, Dict
+from typing import Any, Optional, Text, Dict, List
+
+from rasa.nlu.components import Component
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.training_data import Message, TrainingData
 
 if typing.TYPE_CHECKING:
     from rasa.nlu.model import Metadata
@@ -9,21 +12,13 @@
 class MyComponent(Component):
     """A new component"""
 
-    # Defines what attributes the pipeline component will
-    # provide when called. The listed attributes
-    # should be set by the component on the message object
-    # during test and train, e.g.
-    # ```message.set("entities", [...])```
-    provides = []
-
-    # Which attributes on a message are required by this
-    # component. E.g. if requires contains "tokens", than a
-    # previous component in the pipeline needs to have "tokens"
-    # within the above described `provides` property.
-    # Use `any_of("option_1", "option_2")` to define that either
-    # "option_1" or "option_2" needs to be present in the
-    # provided properties from the previous components.
-    requires = []
+    # Which components are required by this component.
+    # Listed components should appear before the component itself in the pipeline.
+    @classmethod
+    def required_components(cls) -> List[Any]:
+        """Specify which components need to be present in the pipeline."""
+
+        return []
 
     # Defines the default configuration parameters of a component
     # these values can be overwritten in the pipeline configuration
@@ -37,10 +32,15 @@ class MyComponent(Component):
     # This is an important feature for backwards compatibility of components.
     language_list = None
 
-    def __init__(self, component_config=None):
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super().__init__(component_config)
 
-    def train(self, training_data, cfg, **kwargs):
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
+    ) -> None:
         """Train this component.
 
         This is the components chance to train itself provided
@@ -53,7 +53,7 @@ def train(self, training_data, cfg, **kwargs):
         of components previous to this one."""
         pass
 
-    def process(self, message, **kwargs):
+    def process(self, message: Message, **kwargs: Any) -> None:
         """Process an incoming message.
 
         This is the components chance to process an incoming
diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py
index 38b0c9a05859..bd81b1b08681 100644
--- a/tests/nlu/test_config.py
+++ b/tests/nlu/test_config.py
@@ -8,7 +8,6 @@
 from rasa.nlu.components import ComponentBuilder
 from rasa.nlu.registry import registered_pipeline_templates
 from rasa.nlu.model import Trainer
-from rasa.nlu.training_data.training_data import TrainingData
 from tests.nlu.utilities import write_file_config
 
 
@@ -76,8 +75,8 @@ def test_invalid_requred_components_in_config():
     assert "Add required components to the pipeline" in str(execinfo.value)
 
     with pytest.raises(config.InvalidConfigError) as execinfo:
-        Trainer(config.RasaNLUModelConfig(count_vectors_config)).train(TrainingData())
-    assert "Missing property" in str(execinfo.value)
+        Trainer(config.RasaNLUModelConfig(count_vectors_config))
+    assert "Add required components to the pipeline" in str(execinfo.value)
 
 
 @pytest.mark.parametrize(

From 6e76414633a1c25fb0c5391287feeeccdb76e1a9 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 21 Feb 2020 21:56:10 +0100
Subject: [PATCH 536/633] fix entity extractor import

---
 tests/nlu/test_evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
index 03e900d8d48d..ffdf1c95ba32 100644
--- a/tests/nlu/test_evaluation.py
+++ b/tests/nlu/test_evaluation.py
@@ -9,7 +9,7 @@
 import rasa.utils.io
 from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
 from rasa.test import compare_nlu_models
-from rasa.nlu.extractors import EntityExtractor
+from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor
 from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
 from rasa.nlu.model import Interpreter

From 9d0785287146d48f96e36b14e43acf41cea676ed Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 08:56:25 +0100
Subject: [PATCH 537/633] fix loading TED policy

---
 rasa/core/policies/ted_policy.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 2997cda1e389..40912404ef27 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -368,16 +368,16 @@ def persist(self, path: Text) -> None:
         self.model.save(str(tf_model_file))
 
         io_utils.json_pickle(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.priority.json", self.priority
+            model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl", self.priority
         )
-        io_utils.json_pickle(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.meta.json", self.config
+        io_utils.pickle_dump(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl", self.config
         )
         io_utils.json_pickle(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.json", self.data_example
+            model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl", self.data_example
         )
         io_utils.json_pickle(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.json", self._label_data
+            model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl", self._label_data
         )
 
     @classmethod
@@ -407,7 +407,7 @@ def load(cls, path: Text) -> "TEDPolicy":
         label_data = io_utils.json_unpickle(
             model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl"
         )
-        meta = io_utils.json_unpickle(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
+        meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
         priority = io_utils.json_unpickle(
             model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
         )

From 4f8a7ad7d51f065cd0c185df4e5491b8636a2998 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 09:48:42 +0100
Subject: [PATCH 538/633] check if tag id dict exists.

---
 rasa/nlu/classifiers/diet_classifier.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 07225d9128b4..a657b748f06f 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -872,9 +872,10 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         inverted_label_dict = {
             int(key): value for key, value in inverted_label_dict.items()
         }
-        inverted_tag_dict = {
-            int(key): value for key, value in inverted_tag_dict.items()
-        }
+        if inverted_tag_dict is not None:
+            inverted_tag_dict = {
+                int(key): value for key, value in inverted_tag_dict.items()
+            }
 
         return (
             batch_tuple_sizes,

From d12ff0a3a4def11e32dd18710bb67756bae7be82 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 11:15:51 +0100
Subject: [PATCH 539/633] update docstrings in components.py

---
 rasa/nlu/classifiers/diet_classifier.py       |   1 -
 rasa/nlu/components.py                        | 118 ++++++++++++++++--
 .../dense_featurizer/lm_featurizer.py         |   2 +-
 rasa/nlu/utils/bilou_utils.py                 |   8 +-
 4 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index a657b748f06f..59240fc4e400 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -72,7 +72,6 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
-    EVALUATE_ONCE_PER_EPOCH,
 )
 
 
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index e80d9936914c..d1c1c5d66e74 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -14,7 +14,14 @@
 
 
 def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
-    """Tries to import all package names and returns the packages where it failed."""
+    """Tries to import all package names and returns the packages where it failed.
+
+    Args:
+        package_names: the package names to import
+
+    Returns:
+        package names that could not be imported
+    """
 
     import importlib
 
@@ -52,7 +59,12 @@ def validate_requirements(component_names: List[Text]) -> None:
 
 
 def validate_tokenizers(pipeline: List["Component"]) -> None:
-    """Validates that only one tokenizer is present in the pipeline."""
+    """Validates that only one tokenizer is present in the pipeline.
+
+    Args:
+        pipeline: the list of components in the pipeline
+
+    """
 
     from rasa.nlu.tokenizers.tokenizer import Tokenizer
 
@@ -69,7 +81,12 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
 
 
 def validate_required_components(pipeline: List["Component"]) -> None:
-    """Validates that all required components are present in the pipeline."""
+    """Validates that all required components are present in the pipeline.
+
+    Args:
+        pipeline: the list of components in the pipeline
+
+    """
 
     unique_component_names = set()
     for component in pipeline:
@@ -86,7 +103,16 @@ def validate_arguments(
     context: Dict[Text, Any],
     allow_empty_pipeline: bool = False,
 ) -> None:
-    """Validates that all arguments are present to train the pipeline."""
+    """Validates that all arguments are present to train the pipeline.
+
+    Args:
+        pipeline: the list of components in the pipeline
+        context: the component context
+        allow_empty_pipeline: whether to allow an empty pipeline or not
+
+    Returns:
+        `True` if all arguments are valid, `False` otherwise
+    """
 
     # Ensure the pipeline is not empty
     if not allow_empty_pipeline and len(pipeline) == 0:
@@ -145,7 +171,13 @@ def validate_requires_any_of(
 def validate_required_components_from_data(
     pipeline: List["Component"], data: TrainingData
 ) -> None:
-    """Validates that all components are present in the pipeline based on data."""
+    """Validates that all components are present in the pipeline based on data.
+
+    Args:
+        pipeline: the list of components in the pipeline
+        data: the :class:`rasa.nlu.training_data.TrainingData`
+
+    """
 
     response_selector_exists = False
     for component in pipeline:
@@ -293,6 +325,9 @@ def required_packages(cls) -> List[Text]:
 
         This list of requirements allows us to fail early during training
         if a required package is not installed.
+
+        Returns:
+            a list of required packages
         """
 
         return []
@@ -314,6 +349,15 @@ def load(
         Components can rely on any context attributes that are
         created by :meth:`components.Component.create`
         calls to components previous to this one.
+
+        Args:
+            meta: any configuration parameter related to the model
+            model_dir: the directory to load the component from
+            model_metadata: the model's :class:`rasa.nlu.model.Metadata`
+            cached_component: the cached component
+
+        Returns:
+            the loaded component
         """
 
         if cached_component:
@@ -327,7 +371,15 @@ def create(
     ) -> "Component":
         """Creates this component (e.g. before a training is started).
 
-        Method can access all configuration parameters."""
+        Method can access all configuration parameters.
+
+        Args:
+            component_config: the components configuration parameters
+            config: the model configuration parameters
+
+        Returns:
+            the created component
+        """
 
         # Check language supporting
         language = config.language
@@ -349,6 +401,9 @@ def provide_context(self) -> Optional[Dict[Text, Any]]:
         It's mostly used to initialize framework environments
         like MITIE and spacy
         (e.g. loading word vectors for the pipeline).
+
+        Returns:
+            the updated component configuration
         """
 
         pass
@@ -369,6 +424,11 @@ def train(
         on any context attributes created by a call to
         :meth:`rasa.nlu.components.Component.train`
         of components previous to this one.
+
+        Args:
+            training_data: the :class:`rasa.nlu.training_data.TrainingData`
+            config: the model configuration parameters
+
         """
 
         pass
@@ -384,12 +444,24 @@ def process(self, message: Message, **kwargs: Any) -> None:
         on any context attributes created by a call to
         :meth:`rasa.nlu.components.Component.process`
         of components previous to this one.
+
+        Args:
+            message: the :class:`rasa.nlu.training_data.Message` to process
+
         """
 
         pass
 
     def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
-        """Persist this component to disk for future loading."""
+        """Persist this component to disk for future loading.
+
+        Args:
+            file_name: the file name of the model
+            model_dir: the directory to store the model to
+
+        Returns:
+            an optional dictionary with any information about the stored model
+        """
 
         pass
 
@@ -403,6 +475,13 @@ def cache_key(
         Otherwise, an instantiation of the
         component will be reused for all models where the
         metadata creates the same key.
+
+        Args:
+            component_meta: the component configuration
+            model_metadata: the component's :class:`rasa.nlu.model.Metadata`
+
+        Returns:
+            a unique caching key
         """
 
         return None
@@ -428,6 +507,11 @@ def prepare_partial_processing(
         previous to this one in the pipeline and
         have already finished their training (and can therefore
         be safely used to process messages).
+
+        Args:
+            pipeline: the list of components
+            context: the context of processing
+
         """
 
         self.partial_processing_pipeline = pipeline
@@ -439,6 +523,13 @@ def partially_process(self, message: Message) -> Message:
 
         The passed message will be processed by all components
         previous to this one in the pipeline.
+
+        Args:
+            message: the :class:`rasa.nlu.training_data.Message` to process
+
+        Returns:
+            the processed :class:`rasa.nlu.training_data.Message`
+
         """
 
         if self.partial_processing_context is not None:
@@ -454,6 +545,12 @@ def can_handle_language(cls, language: Hashable) -> bool:
 
         This method can be overwritten when needed. (e.g. dynamically
         determine which language is supported.)
+
+        Args:
+            language: the language to check
+
+        Returns:
+            `True` if component can handle specific language, `False` otherwise
         """
 
         # if language_list is set to `None` it means: support all languages
@@ -558,6 +655,13 @@ def create_component(
 
         Tries to retrieve a component from the cache,
         calls `create` to create a new component.
+
+        Args:
+            component_config: the component configuration
+            cfg: the model configuration
+
+        Returns:
+            the created component
         """
 
         from rasa.nlu import registry
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index ce8d76073750..5a5f572033a7 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -54,7 +54,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
         self._set_lm_features(message)
 
-    def _set_lm_features(self, message: Message, attribute: Text = TEXT):
+    def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None:
         """Adds the precomputed word vectors to the messages features."""
 
         doc = self.get_doc(message, attribute)
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 1e8c42612170..0d1490f40eee 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -120,12 +120,12 @@ def _handle_entities(
         # Only interested if the tokenization is correct
         if start_token_idx is not None and end_token_idx is not None:
             if start_token_idx == end_token_idx:
-                bilou[start_token_idx] = "U-%s" % label
+                bilou[start_token_idx] = f"U-{label}"
             else:
-                bilou[start_token_idx] = "B-%s" % label
+                bilou[start_token_idx] = f"B-{label}"
                 for i in range(start_token_idx + 1, end_token_idx):
-                    bilou[i] = "I-%s" % label
-                bilou[end_token_idx] = "L-%s" % label
+                    bilou[i] = f"I-{label}"
+                bilou[end_token_idx] = f"L-{label}"
 
 
 def _get_entity_positions(entities: List[Tuple[int, int, Text]]) -> Set[int]:

From befeac5fce5496072ec4fa707b8dab81ea65ffa6 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 11:23:23 +0100
Subject: [PATCH 540/633] add empty pipeline validation

---
 rasa/nlu/components.py | 61 ++++++++++++++++++++++++++++--------------
 rasa/nlu/model.py      |  3 +--
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 40767ab7c33d..2960a530e3c2 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -50,6 +50,19 @@ def validate_requirements(component_names: List[Text]) -> None:
         )
 
 
+def validate_empty_pipeline(pipeline: List["Component"]) -> None:
+    """Ensures the pipeline is not empty."""
+
+    if len(pipeline) == 0:
+        raise InvalidConfigError(
+            "Can not train an empty pipeline. "
+            "Make sure to specify a proper pipeline in "
+            "the configuration using the 'pipeline' key. "
+            "The 'backend' configuration key is "
+            "NOT supported anymore."
+        )
+
+
 def validate_tokenizers(pipeline: List["Component"]) -> None:
     """Validates that only one tokenizer is present in the pipeline."""
 
@@ -78,33 +91,35 @@ def _required_component_in_pipeline(
     return False
 
 
+def _check_deprecated_attributes(component: "Component") -> None:
+    if hasattr(component, "provides"):
+        raise_warning(
+            f"'{component.name}' contains property 'provides', "
+            f"which is deprecated. There is no need to specify "
+            f"the list of attributes that a component provides.",
+            category=FutureWarning,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
+        )
+    if hasattr(component, "requires"):
+        raise_warning(
+            f"'{component.name}' contains property 'requires', "
+            f"which is deprecated. Use 'required_components()' method "
+            f"to specify which components are required to be present "
+            f"in the pipeline by this component.",
+            category=FutureWarning,
+            docs="https://rasa.com/docs/rasa/migration-guide/",
+        )
+
+
 def validate_required_components(pipeline: List["Component"]) -> None:
     """Validates that all required components are present in the pipeline."""
 
     for i, component in enumerate(pipeline):
-        if hasattr(component, "provides"):
-            raise_warning(
-                f"'{component.name}' contains property 'provides', "
-                f"which is deprecated. There is no need to specify "
-                f"the list of attributes that a component provides.",
-                category=FutureWarning,
-                docs="https://rasa.com/docs/rasa/migration-guide/",
-            )
-        if hasattr(component, "requires"):
-            raise_warning(
-                f"'{component.name}' contains property 'requires', "
-                f"which is deprecated. Use 'required_components()' method "
-                f"to specify which components are required to be present "
-                f"in the pipeline by this component.",
-                category=FutureWarning,
-                docs="https://rasa.com/docs/rasa/migration-guide/",
-            )
+        _check_deprecated_attributes(component)
 
         missing_components = []
         for required_component in component.required_components():
-            if not _required_component_in_pipeline(
-                required_component, pipeline[: i + 1]
-            ):
+            if not _required_component_in_pipeline(required_component, pipeline[:i]):
                 missing_components.append(required_component.name)
 
         if missing_components:
@@ -114,6 +129,12 @@ def validate_required_components(pipeline: List["Component"]) -> None:
             )
 
 
+def validate_pipeline(pipeline: List["Component"]) -> None:
+    validate_empty_pipeline(pipeline)
+    validate_tokenizers(pipeline)
+    validate_required_components(pipeline)
+
+
 def validate_required_components_from_data(
     pipeline: List["Component"], data: TrainingData
 ) -> None:
diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py
index 008078426776..0c86fe9137cc 100644
--- a/rasa/nlu/model.py
+++ b/rasa/nlu/model.py
@@ -158,8 +158,7 @@ def _build_pipeline(
             pipeline.append(component)
 
         if not self.skip_validation:
-            components.validate_tokenizers(pipeline)
-            components.validate_required_components(pipeline)
+            components.validate_pipeline(pipeline)
 
         return pipeline
 

From 4dd23d0a0994fc81512c8b90f2c60190096d5ba2 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 12:43:23 +0100
Subject: [PATCH 541/633] fix refs in docstings

---
 rasa/nlu/components.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index d1c1c5d66e74..2d14da972d04 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -175,7 +175,7 @@ def validate_required_components_from_data(
 
     Args:
         pipeline: the list of components in the pipeline
-        data: the :class:`rasa.nlu.training_data.TrainingData`
+        data: the :class:`rasa.nlu.training_data.training_data.TrainingData`
 
     """
 
@@ -426,7 +426,7 @@ def train(
         of components previous to this one.
 
         Args:
-            training_data: the :class:`rasa.nlu.training_data.TrainingData`
+            training_data: the :class:`rasa.nlu.training_data.training_data.TrainingData`
             config: the model configuration parameters
 
         """
@@ -446,7 +446,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
         of components previous to this one.
 
         Args:
-            message: the :class:`rasa.nlu.training_data.Message` to process
+            message: the :class:`rasa.nlu.training_data.message.Message` to process
 
         """
 
@@ -525,10 +525,10 @@ def partially_process(self, message: Message) -> Message:
         previous to this one in the pipeline.
 
         Args:
-            message: the :class:`rasa.nlu.training_data.Message` to process
+            message: the :class:`rasa.nlu.training_data.message.Message` to process
 
         Returns:
-            the processed :class:`rasa.nlu.training_data.Message`
+            the processed :class:`rasa.nlu.training_data.message.Message`
 
         """
 

From 287343c993cd09741b4c3f2644286dac56c82d3e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 13:05:03 +0100
Subject: [PATCH 542/633] change json_pickle to pickle_dump

---
 rasa/nlu/classifiers/diet_classifier.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 0dd0de584e5e..6d7ede3803ae 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -796,7 +796,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         io_utils.pickle_dump(
             model_dir / f"{file_name}.data_example.pkl", self.data_example
         )
-        io_utils.json_pickle(
+        io_utils.pickle_dump(
             model_dir / f"{file_name}.label_data.pkl", self._label_data
         )
         io_utils.json_pickle(
@@ -805,7 +805,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         io_utils.json_pickle(
             model_dir / f"{file_name}.inverted_tag_dict.pkl", self.inverted_tag_dict
         )
-        io_utils.pickle_dump(
+        io_utils.json_pickle(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl", self.batch_tuple_sizes
         )
 
@@ -857,14 +857,14 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         model_dir = Path(model_dir)
 
         data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl")
-        label_data = io_utils.json_unpickle(model_dir / f"{file_name}.label_data.pkl")
+        label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl")
         inverted_label_dict = io_utils.json_unpickle(
             model_dir / f"{file_name}.inverted_label_dict.pkl"
         )
         inverted_tag_dict = io_utils.json_unpickle(
             model_dir / f"{file_name}.inverted_tag_dict.pkl"
         )
-        batch_tuple_sizes = io_utils.pickle_load(
+        batch_tuple_sizes = io_utils.json_unpickle(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl"
         )
 

From dc932b717c06da60348bf8cc3f80a92c8bd468af Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 13:33:01 +0100
Subject: [PATCH 543/633] remove all traces of component.required and provides

---
 rasa/nlu/test.py             | 33 +++++++++++++++++++++----------
 tests/nlu/test_components.py | 38 ++++++++++++++++--------------------
 2 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index 13b99e7db927..c5b208519c13 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -994,43 +994,56 @@ def get_eval_data(
 
 def get_entity_extractors(interpreter: Interpreter) -> Set[Text]:
     """Finds the names of entity extractors used by the interpreter.
-    Processors are removed since they do not
-    detect the boundaries themselves."""
 
-    extractors = {c.name for c in interpreter.pipeline if "entities" in c.provides}
+    Processors are removed since they do not detect the boundaries themselves.
+    """
+
+    from rasa.nlu.extractors.extractor import EntityExtractor
+
+    extractors = {
+        c.name for c in interpreter.pipeline if isinstance(c, EntityExtractor)
+    }
     return extractors - ENTITY_PROCESSORS
 
 
 def is_entity_extractor_present(interpreter: Interpreter) -> bool:
-    """Checks whether entity extractor is present"""
+    """Checks whether entity extractor is present."""
 
     extractors = get_entity_extractors(interpreter)
     return extractors != []
 
 
 def is_intent_classifier_present(interpreter: Interpreter) -> bool:
-    """Checks whether intent classifier is present"""
+    """Checks whether intent classifier is present."""
+
+    from rasa.nlu.classifiers.classifier import IntentClassifier
 
     intent_classifiers = [
-        c.name for c in interpreter.pipeline if "intent" in c.provides
+        c.name for c in interpreter.pipeline if isinstance(c, IntentClassifier)
     ]
     return intent_classifiers != []
 
 
 def is_response_selector_present(interpreter: Interpreter) -> bool:
-    """Checks whether response selector is present"""
+    """Checks whether response selector is present."""
+
+    from rasa.nlu.selectors.response_selector import ResponseSelector
 
     response_selectors = [
-        c.name for c in interpreter.pipeline if "response" in c.provides
+        c.name for c in interpreter.pipeline if isinstance(c, ResponseSelector)
     ]
     return response_selectors != []
 
 
 def get_available_response_selector_types(interpreter: Interpreter) -> List[Text]:
-    """Gets all available response selector types"""
+    """Gets all available response selector types."""
+
+    from rasa.nlu.selectors.response_selector import ResponseSelector
 
     response_selector_types = [
-        c.retrieval_intent for c in interpreter.pipeline if "response" in c.provides
+        c.retrieval_intent
+        for c in interpreter.pipeline
+        if isinstance(c, ResponseSelector)
     ]
 
     return response_selector_types
diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py
index 44845d7460a3..bd86025e8b18 100644
--- a/tests/nlu/test_components.py
+++ b/tests/nlu/test_components.py
@@ -2,7 +2,10 @@
 
 from typing import Tuple
 from rasa.nlu import registry
-from rasa.nlu.components import find_unavailable_packages
+from rasa.nlu.components import (
+    find_unavailable_packages,
+    _required_component_in_pipeline,
+)
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Metadata
 from tests.nlu import utilities
@@ -34,26 +37,19 @@ def test_all_components_in_model_templates_exist(pipeline_template):
 
 @pytest.mark.parametrize("component_class", registry.component_classes)
 def test_all_arguments_can_be_satisfied(component_class):
-    """Check that `train` method parameters can be filled
-    filled from the context. Similar to `pipeline_init` test."""
-
-    # All available context arguments that will ever be generated during train
-    # it might still happen, that in a certain pipeline
-    # configuration arguments can not be satisfied!
-    provided_properties = {
-        provided for c in registry.component_classes for provided in c.provides
-    }
-
-    for req in component_class.requires:
-        if isinstance(req, Tuple):
-            for r in req:
-                assert (
-                    r in provided_properties
-                ), "No component provides required property."
-        else:
-            assert (
-                req in provided_properties
-            ), "No component provides required property."
+    """Checks that all required_components are present in the registry."""
+
+    missing_components = []
+    for required_component in component_class.required_components():
+        if not _required_component_in_pipeline(
+            required_component, registry.component_classes
+        ):
+            missing_components.append(required_component.name)
+
+    assert missing_components == [], (
+        f"There is no required components {missing_components} "
+        f"for '{component_class.name}'."
+    )
 
 
 def test_find_unavailable_packages():

From 9e3b51b4db968a007bdbcbbb7a191bfec1c1ac61 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 13:34:52 +0100
Subject: [PATCH 544/633] rename test

---
 tests/nlu/test_components.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py
index bd86025e8b18..d79b3ad8b83f 100644
--- a/tests/nlu/test_components.py
+++ b/tests/nlu/test_components.py
@@ -36,7 +36,7 @@ def test_all_components_in_model_templates_exist(pipeline_template):
 
 
 @pytest.mark.parametrize("component_class", registry.component_classes)
-def test_all_arguments_can_be_satisfied(component_class):
+def test_all_required_components_can_be_satisfied(component_class):
     """Checks that all required_components are present in the registry."""
 
     missing_components = []

From 0c281bbd601e446d6cedf702ec0a9734c6ae21e3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 14:41:58 +0100
Subject: [PATCH 545/633] force_download of HF model weights

---
 .../nlu/utils/hugging_face/hf_transformers.py | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index a26955515cd3..fc79433d5d8b 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -77,12 +77,22 @@ def _load_model(self) -> None:
             self.model_weights = model_weights_defaults[self.model_name]
 
         logger.debug(f"Loading Tokenizer and Model for {self.model_name}")
-        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
-            self.model_weights
-        )
-        self.model = model_class_dict[self.model_name].from_pretrained(
-            self.model_weights
-        )
+
+        try:
+            self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
+                self.model_weights
+            )
+            self.model = model_class_dict[self.model_name].from_pretrained(
+                self.model_weights
+            )
+        except OSError:
+            # if loading of models weights fail, try to download them again
+            self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
+                self.model_weights, force_download=True
+            )
+            self.model = model_class_dict[self.model_name].from_pretrained(
+                self.model_weights, force_download=True
+            )
 
         # Use a universal pad token since all transformer architectures do not have a
         # consistent token. Instead of pad_token_id we use unk_token_id because

From fe6b90ae5a1a2b9adb00cc0d7c8cffd1a0b77c4f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 15:31:39 +0100
Subject: [PATCH 546/633] add docstrings to Policy

---
 rasa/core/policies/policy.py | 79 +++++++++++++++++++++++++++++-------
 rasa/nlu/components.py       | 16 ++++----
 2 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py
index 9a95ba044a40..7527a2489144 100644
--- a/rasa/core/policies/policy.py
+++ b/rasa/core/policies/policy.py
@@ -45,7 +45,15 @@ def featurizer(self):
 
     @staticmethod
     def _get_valid_params(func: Callable, **kwargs: Any) -> Dict:
-        # filter out kwargs that cannot be passed to func
+        """Filters out kwargs that cannot be passed to func.
+
+        Args:
+            func: a callable function
+
+        Returns:
+            the dictionary of parameters
+        """
+
         valid_keys = rasa.utils.common.arguments_of(func)
 
         params = {key: kwargs.get(key) for key in valid_keys if kwargs.get(key)}
@@ -62,8 +70,18 @@ def featurize_for_training(
         **kwargs: Any,
     ) -> DialogueTrainingData:
         """Transform training trackers into a vector representation.
+
         The trackers, consisting of multiple turns, will be transformed
-        into a float vector which can be used by a ML model."""
+        into a float vector which can be used by a ML model.
+
+        Args:
+            training_trackers:
+                the list of the :class:`rasa.core.trackers.DialogueStateTracker`
+            domain: the :class:`rasa.core.domain.Domain`
+
+        Returns:
+            the :class:`rasa.core.training.data.DialogueTrainingData`
+        """
 
         training_data = self.featurizer.featurize_trackers(training_trackers, domain)
 
@@ -83,46 +101,79 @@ def train(
         domain: Domain,
         **kwargs: Any,
     ) -> None:
-        """Trains the policy on given training trackers."""
+        """Trains the policy on given training trackers.
+
+        Args:
+            training_trackers:
+                the list of the :class:`rasa.core.trackers.DialogueStateTracker`
+            domain: the :class:`rasa.core.domain.Domain`
+        """
 
         raise NotImplementedError("Policy must have the capacity to train.")
 
     def predict_action_probabilities(
         self, tracker: DialogueStateTracker, domain: Domain
     ) -> List[float]:
-        """Predicts the next action the bot should take
-        after seeing the tracker.
+        """Predicts the next action the bot should take after seeing the tracker.
 
-        Returns the list of probabilities for the next actions"""
+        Args:
+            tracker: the :class:`rasa.core.trackers.DialogueStateTracker`
+            domain: the :class:`rasa.core.domain.Domain`
+
+        Returns:
+             the list of probabilities for the next actions
+        """
 
         raise NotImplementedError("Policy must have the capacity to predict.")
 
     def persist(self, path: Text) -> None:
-        """Persists the policy to a storage."""
+        """Persists the policy to a storage.
+
+        Args:
+            path: the path where to save the policy to
+        """
+
         raise NotImplementedError("Policy must have the capacity to persist itself.")
 
     @classmethod
     def load(cls, path: Text) -> "Policy":
         """Loads a policy from the storage.
-            Needs to load its featurizer"""
+
+        Needs to load its featurizer.
+
+        Args:
+            path: the path from where to load the policy
+        """
+
         raise NotImplementedError("Policy must have the capacity to load itself.")
 
     @staticmethod
     def _default_predictions(domain: Domain) -> List[float]:
+        """Creates a list of zeros.
+
+        Args:
+            domain: the :class:`rasa.core.domain.Domain`
+        Returns:
+            the list of the length of the number of actions
+        """
+
         return [0.0] * domain.num_actions
 
 
-def confidence_scores_for(action_name, value, domain) -> List[float]:
+def confidence_scores_for(
+    action_name: Text, value: float, domain: Domain
+) -> List[float]:
     """Returns confidence scores if a single action is predicted.
 
     Args:
-        action_name: Name of action for which the score should be set.
-        value: Confidence for `action_name`.
-        domain: Domain which contains all actions.
-
-    Returns: List of length `len(nr_actions)`.
+        action_name: the name of the action for which the score should be set
+        value: the confidence for `action_name`
+        domain: the :class:`rasa.core.domain.Domain`
 
+    Returns:
+        the list of the length of the number of actions
     """
+
     results = [0.0] * domain.num_actions
     idx = domain.index_for_action(action_name)
     results[idx] = value
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 2d14da972d04..d9aafad0128d 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -35,7 +35,11 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
 
 
 def validate_requirements(component_names: List[Text]) -> None:
-    """Validates that all required importable python packages are installed."""
+    """Validates that all required importable python packages are installed
+
+    Args:
+        component_names: the list of component names
+    """
 
     from rasa.nlu import registry
 
@@ -62,8 +66,7 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
     """Validates that only one tokenizer is present in the pipeline.
 
     Args:
-        pipeline: the list of components in the pipeline
-
+        pipeline: the list of the :class:`rasa.nlu.components.Component`
     """
 
     from rasa.nlu.tokenizers.tokenizer import Tokenizer
@@ -84,8 +87,7 @@ def validate_required_components(pipeline: List["Component"]) -> None:
     """Validates that all required components are present in the pipeline.
 
     Args:
-        pipeline: the list of components in the pipeline
-
+        pipeline: the list of the :class:`rasa.nlu.components.Component`
     """
 
     unique_component_names = set()
@@ -106,7 +108,7 @@ def validate_arguments(
     """Validates that all arguments are present to train the pipeline.
 
     Args:
-        pipeline: the list of components in the pipeline
+        pipeline: the list of the :class:`rasa.nlu.components.Component`
         context: the component context
         allow_empty_pipeline: whether to allow an empty pipeline or not
 
@@ -174,7 +176,7 @@ def validate_required_components_from_data(
     """Validates that all components are present in the pipeline based on data.
 
     Args:
-        pipeline: the list of components in the pipeline
+        pipeline: the list of the :class:`rasa.nlu.components.Component`
         data: the :class:`rasa.nlu.training_data.training_data.TrainingData`
 
     """

From 92a90fa709c86a7776db3f480e4257f10aaa96ad Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 15:47:59 +0100
Subject: [PATCH 547/633] fix test

---
 tests/nlu/test_components.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/nlu/test_components.py b/tests/nlu/test_components.py
index d79b3ad8b83f..021ac82e830d 100644
--- a/tests/nlu/test_components.py
+++ b/tests/nlu/test_components.py
@@ -2,10 +2,7 @@
 
 from typing import Tuple
 from rasa.nlu import registry
-from rasa.nlu.components import (
-    find_unavailable_packages,
-    _required_component_in_pipeline,
-)
+from rasa.nlu.components import find_unavailable_packages
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Metadata
 from tests.nlu import utilities
@@ -39,11 +36,15 @@ def test_all_components_in_model_templates_exist(pipeline_template):
 def test_all_required_components_can_be_satisfied(component_class):
     """Checks that all required_components are present in the registry."""
 
+    def _required_component_in_registry(component):
+        for previous_component in registry.component_classes:
+            if issubclass(previous_component, component):
+                return True
+        return False
+
     missing_components = []
     for required_component in component_class.required_components():
-        if not _required_component_in_pipeline(
-            required_component, registry.component_classes
-        ):
+        if not _required_component_in_registry(required_component):
             missing_components.append(required_component.name)
 
     assert missing_components == [], (

From 6a12f025a1ff3a83f632f0a8d258cdeca65c931e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 16:44:05 +0100
Subject: [PATCH 548/633] review comments on docs

---
 changelog/5266.misc.rst                       |  2 +-
 .../config_crf_custom_features.yml            |  2 +-
 docs/api/tensorflow_usage.rst                 | 12 +--
 docs/core/policies.rst                        |  6 +-
 docs/migration-guide.rst                      | 25 +++---
 docs/nlu/choosing-a-pipeline.rst              | 66 ++++++++-------
 docs/nlu/components.rst                       | 80 ++++++++++---------
 7 files changed, 101 insertions(+), 92 deletions(-)

diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst
index 0fc0fd323290..6aa2041f348b 100644
--- a/changelog/5266.misc.rst
+++ b/changelog/5266.misc.rst
@@ -1,4 +1,4 @@
-We deprecated all existing pipeline templates, ``SklearnIntentClassifier`` and ``KerasPolicy``.
+We deprecated all existing NLU pipeline templates, ``SklearnIntentClassifier`` and ``KerasPolicy``.
 
 Please list the components you want to use directly in your configuration file.
 Check out :ref:`Choosing a Pipeline <choosing-a-pipeline>` to decide what components to
diff --git a/data/test_config/config_crf_custom_features.yml b/data/test_config/config_crf_custom_features.yml
index 1301091a5a6c..70bdc09129e2 100644
--- a/data/test_config/config_crf_custom_features.yml
+++ b/data/test_config/config_crf_custom_features.yml
@@ -15,7 +15,7 @@ pipeline:
     # features for word before token
     - ["low", "title", "upper", "pos", "pos2"]
     # features of token itself
-    - ["low", "word3", "word2", "upper", "title", "digit", "pos", "pos2"]
+    - ["bias", "low", "word3", "word2", "upper", "title", "digit", "pos", "pos2", "pattern"]
     # features for word after the token we want to tag
     - ["low", "title", "upper", "pos", "pos2"]
   max_iterations: 50
diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst
index e45b0fde3617..aa9d84023844 100644
--- a/docs/api/tensorflow_usage.rst
+++ b/docs/api/tensorflow_usage.rst
@@ -2,10 +2,10 @@
 
 .. _tensorflow_usage:
 
-Setting up the TensorFlow Runtime
-=================================
+TensorFlow Configuration
+========================
 
-TensorFlow allows setting the runtime environment via
+TensorFlow allows configuring options in the runtime environment via
 `TF Config submodule <https://www.tensorflow.org/api_docs/python/tf/config>`_. Rasa Open Source supports a smaller subset of these
 configuration options and makes appropriate calls to the ``tf.config`` submodule.
 This smaller subset comprises of configurations that developers frequently use with Rasa Open Source.
@@ -17,14 +17,14 @@ Optimizing CPU Performance
 Parallelizing One Operation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
+Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify the maximum number of threads that can be used
 to parallelize the execution of one operation. If left unspecified, this value defaults to ``0`` which means TensorFlow should
 pick an appropriate value depending on the system configuration.
 
 Parallelizing Multiple Operations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify maximum number of threads that can be used
+Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify the maximum number of threads that can be used
 to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to ``0``
 which means TensorFlow should pick an appropriate value depending on the system configuration.
 
@@ -36,7 +36,7 @@ Limiting GPU Memory Growth
 
 TensorFlow by default blocks all the available GPU memory for the running process. This can be limiting if you are running
 multiple TensorFlow processes and want to distribute memory across them. To prevent this,
-set an environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
+set the environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``.
 
 Restricting Absolute GPU Memory Available
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/core/policies.rst b/docs/core/policies.rst
index a9073b34ee5e..1f176b80fc00 100644
--- a/docs/core/policies.rst
+++ b/docs/core/policies.rst
@@ -199,15 +199,15 @@ Embedding Policy
 
     .. warning::
 
-        ``EmbeddingPolicy`` got renamed to ``TEDPolicy``. Please use :ref:`ted_policy` instead of ``EmbeddingPolicy``.
-        The functionality of the policy stayed the same.
+        ``EmbeddingPolicy`` was renamed to ``TEDPolicy``. Please use :ref:`ted_policy` instead of ``EmbeddingPolicy``
+        in your policy configuration. The functionality of the policy stayed the same.
 
 .. _ted_policy:
 
 TED Policy
 ^^^^^^^^^^
 
-Transformer Embedding Dialogue (TED) Policy is described in
+The Transformer Embedding Dialogue (TED) Policy is described in
 `our paper <https://arxiv.org/abs/1910.00486>`__.
 
 This policy has a pre-defined architecture, which comprises the
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 6d86a9c01480..f0daa232efb9 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -39,7 +39,7 @@ General
 - All pre-defined pipeline templates are deprecated. Take a look at :ref:`choosing-a-pipeline`
   to decide on what components you should use in your configuration file.
 
-- The :ref:`embedding_policy` got renamed to :ref:`ted_policy`. The functionality of the policy stayed the same.
+- The :ref:`embedding_policy` was renamed to :ref:`ted_policy`. The functionality of the policy stayed the same.
   Please update your configuration files to use ``TEDPolicy`` instead of ``EmbeddingPolicy``.
 
 - Most of the model options for ``EmbeddingPolicy``, ``EmbeddingIntentClassifier``, and ``ResponseSelector`` got
@@ -69,30 +69,29 @@ General
   evaluate_on_num_examples       evaluate_on_number_of_examples
   =============================  =======================================================
 
-  A warning will be logged in case an old option is used. However, you can still use the old configuration options.
-  They will be mapped to the new names.
+  Old configuration options will be mapped to the new names, and a warning will be thrown.
+  However, these will be deprecated in a future release.
 
 - :ref:`embedding-intent-classifier` is now deprecated and will be replaced by :ref:`diet-classifier` in the future.
-  ``DIETClassfier`` is based on a multi-task architecture for intent classification and entity recognition.
-  However, if you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use
+  ``DIETClassfier`` performs intent classification as well as entity recognition.
+  If you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use
   the following configuration of ``DIETClassifier``:
 
   .. code-block:: yaml
 
     pipeline:
-    - ... # other components
+    # - ... other components
     - name: DIETClassifier
       intent_classification: True
       entity_recognition: False
       use_masked_language_model: False
       BILOU_flag: False
       number_of_transformer_layers: 0
-      ... # any other parameters
+      # ... any other parameters
 
   See :ref:`diet-classifier` for more information about the new component.
 
-- ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. ``DIETClassfier``
-  is based on a multi-task architecture for intent classification and entity recognition. However, if you want to
+- ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. If you want to
   get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration:
 
   .. code-block:: yaml
@@ -124,12 +123,10 @@ General
       number_of_transformer_layers: 0
       # ... any other parameters
 
-  As you can see in the configuration, you need to add the ``LexicalSyntacticFeaturizer`` before the ``DIETClassifier``
-  to your pipeline. ``CRFEntityExtractor`` featurizes user messages on its own, it does not depend on any featurizer.
-  We extracted the featurization from the component into the new featurizer ``LexicalSyntacticFeaturizer``. Thus,
+  ``CRFEntityExtractor`` featurizes user messages on its own, it does not depend on any featurizer.
+  We extracted the featurization from the component into the new featurizer :ref:``LexicalSyntacticFeaturizer``. Thus,
   in order to obtain the same results as before, you need to add this featurizer to your pipeline before the
-  ``DIETClassifier``. For more information about the ``DIETClassifier`` and the ``LexicalSyntacticFeaturizer``
-  see :ref:`components`.
+  :ref:``diet-classifier``.
 
 .. _migration-to-rasa-1.7:
 
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index bcd5bb0d4658..ec9f8bf8bd39 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -1,4 +1,4 @@
-:desc: Set up a pipeline of pre-trained components.
+:desc: Set up a pipeline of components.
 
 .. _choosing-a-pipeline:
 
@@ -14,19 +14,21 @@ it on your dataset.
    :local:
 
 .. warning::
-    We deprecated all existing pipeline templates, e.g. ``supervised_embeddings``, ``pretrained_embeddings_convert``
-    and ``pretrained_embeddings_spacy``. Please, list any components you want to use directly in the configuration
-    file.
+    We deprecated all existing pipeline templates, e.g.
+    :ref:`supervised_embeddings <section_supervised_embeddings_pipeline>`,
+    :ref:`pretrained_embeddings_spacy <section_pretrained_embeddings_spacy_pipeline>` and
+    :ref:`pretrained_embeddings_convert <section_pretrained_embeddings_convert_pipeline>`. Please, list any
+    components you want to use directly in the configuration file.
 
 The Short Answer
 ----------------
 
-If your training data is in english, a good starting point is the following pipeline:
+If your training data is in English, a good starting point is the following pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/default_english_config.yml
     :language: yaml
 
-In case your training data is multi-lingual and is rich with domain specific vocabulary,
+In case your training data is in a different language than English and is rich with domain specific vocabulary,
 use the following pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/default_config.yml
@@ -56,7 +58,7 @@ Tokenization
 If your chosen language is whitespace-tokenized (words are separated by spaces), you
 can use the ``WhitespaceTokenizer``. If this is not the case you should use a different tokenizer.
 We support a number of different :ref:`tokenizers <tokenizers>`, or you can
-:ref:`create your own <custom-nlu-components>`.
+create your own :ref:`custom tokenizer <custom-nlu-components>`.
 
 .. note::
     Some components further down the pipeline may require a specific tokenizer. You can find those requirements
@@ -71,12 +73,12 @@ If you do not use any pre-trained word embeddings, your word vectors will be cus
 in general English, the word "balance" is closely related to "symmetry", but very different to the word "cash". In a
 banking domain, "balance" and "cash" are closely related and you'd like your model to capture that. If you don't
 use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language and domain.
-Thus, you should only use featurizers from the category `sparse` featuirzers, such as
+In those cases you should only use featurizers from the category `sparse` featurizers, such as
 ``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``.
 
 The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
 "I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model already knows that the
-words "apples" and "pears" are very similar. This is especially useful if you don't have large enough training data.
+words "apples" and "pears" are very similar. This is especially useful if you don't have enough training data.
 We support a few components that provide pre-trained word embeddings:
 
 1. :ref:`MitieFeaturizer`
@@ -84,24 +86,26 @@ We support a few components that provide pre-trained word embeddings:
 3. :ref:`ConveRTFeaturizer`
 4. :ref:`LanguageModelFeaturizer`
 
-If your training data is in English, we recommend to use the ``ConveRTFeaturizer``.
+If your training data is in English, we recommend using the ``ConveRTFeaturizer``.
 The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence. For example, if you
 have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
-for both is highly likely to be the same. This is also useful if you don't have large enough training data.
+for both is highly likely to be the same. This is also useful if you don't have enough training data.
 
-An alternative to ``ConveRTFeaturizer`` can be ``LanguageModelFeaturizer`` which uses pre-trained language models such
+An alternative to ``ConveRTFeaturizer`` is the ``LanguageModelFeaturizer`` which uses pre-trained language models such
 as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See
 :ref:`HFTransformersNLP` for a full list of supported language models.
 
-In case, your training data is not in English you can also use a different variant of a language model which
-is pre-trained in the language specific to your training data. For example, there is a chinese language variant of
-BERT(``bert-base-chinese``) or a japanese variant of it(``bert-base-japanese``). A full list of different variants of
+If your training data is not in English you can also use a different variant of a language model which
+is pre-trained in the language specific to your training data.
+For example, there are chinese (``bert-base-chinese``) and japanese (``bert-base-japanese``) variants of the BERT model.
+A full list of different variants of
 these language models is available in the
-`official docs of Transformers library <https://huggingface.co/transformers/pretrained_models.html>_`.
+`official documentation of the transformers library <https://huggingface.co/transformers/pretrained_models.html>_`.
 
-``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`).
+``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`),
+so you can use this as another alternative, depending on the language of your training data.
 So, this featurizer can also be an alternate option depending on the language of your training data.
 
 Entity Recognition / Intent Classification / Response Selectors
@@ -149,15 +153,14 @@ Multiple Intents
 ----------------
 
 If you want to split intents into multiple labels, e.g. for predicting multiple intents or for modeling hierarchical
-intent structure, you need to use :ref:`diet-classifier` in your pipeline.
-To do this, use these flags in any tokenizer:
+intent structure, you need to use the :ref:`diet-classifier` in your pipeline.
+You'll also need to define these flags in whichever tokenizer you are using:
 
-    - ``intent_tokenization_flag``: indicates whether to tokenize intent labels or not. By default this flag is set to
-      ``False``, intent will not be tokenized.
-    - ``intent_split_symbol``: sets the delimiter string to split the intent labels. Default ``_``.
+    - ``intent_tokenization_flag``: Set it to ``True``, so that intent labels are tokenized.
+    - ``intent_split_symbol``: Set it to the delimiter string that splits the intent labels. Default ``_``.
 
-`Here <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__ is a
-tutorial on how to use multiple intents in Rasa.
+Read a `tutotiral <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__
+on how to use multiple intents in Rasa.
 
 Here's an example configuration:
 
@@ -277,6 +280,7 @@ exactly. Instead it will return the trained synonym.
     information and returns ``null``.
 
 
+
 Pipeline Templates (deprecated)
 -------------------------------
 
@@ -323,7 +327,7 @@ pretrained_embeddings_spacy
 The advantage of ``pretrained_embeddings_spacy`` pipeline is that if you have a training example like:
 "I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model
 already knows that the words "apples" and "pears" are very similar. This is especially useful
-if you don't have large enough training data.
+if you don't have enough training data.
 
 To use the ``pretrained_embeddings_spacy`` template, use the following configuration:
 
@@ -341,17 +345,17 @@ To use the components and configure them separately:
 pretrained_embeddings_convert
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    .. warning::
+    .. note::
         Since ``ConveRT`` model is trained only on an **English** corpus of conversations, this pipeline should only
         be used if your training data is in English language.
 
-This pipeline uses `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model to extract vector representation of
-a sentence and feeds them to ``EmbeddingIntentClassifier`` for intent classification.
-The advantage of using ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user
+This pipeline uses the `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`_ model to extract a vector representation of
+a sentence and feeds them to the ``EmbeddingIntentClassifier`` for intent classification.
+The advantage of using the ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user
 message independently, but creates a contextual vector representation for the complete sentence. For example, if you
 have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
-for both is highly likely to be the same. This is also useful if you don't have large enough training data.
+for both is highly likely to be the same. This is also useful if you don't have enough training data.
 
     .. note::
         To use ``pretrained_embeddings_convert`` pipeline, you should install Rasa with ``pip install rasa[convert]``.
@@ -379,7 +383,7 @@ but very different to the word "cash". In a banking domain, "balance" and "cash"
 and you'd like your model to capture that. This pipeline doesn't use a language-specific model,
 so it will work with any language that you can tokenize (on whitespace or using a custom tokenizer).
 
-You can read more about this topic `here <https://medium.com/rasa-blog/supervised-word-vectors-from-scratch-in-rasa-nlu-6daf794efcd8>`__ .
+You can read more about this topic `in this blog post <https://medium.com/rasa-blog/supervised-word-vectors-from-scratch-in-rasa-nlu-6daf794efcd8>`__ .
 
 To train a Rasa model in your preferred language, define the
 ``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file:
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 0936bfed0d15..3ad3f49b7e16 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -148,8 +148,8 @@ Tokenizers split text into tokens.
 If you want to split intents into multiple labels, e.g. for predicting multiple intents or for
 modeling hierarchical intent structure, use these flags with any tokenizer:
 
-- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. By default this flag is set to
-  ``False``, intent will not be tokenized.
+- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. Set it to ``True``, so that intent
+  labels are tokenized.
 - ``intent_split_symbol`` sets the delimiter string to split the intent labels, default is underscore
   (``_``).
 
@@ -160,13 +160,13 @@ WhitespaceTokenizer
 ~~~~~~~~~~~~~~~~~~~
 
 :Short: Tokenizer using whitespaces as a separator
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: Nothing
 :Description:
     Creates a token for every whitespace separated character sequence.
 :Configuration:
-    Make the tokenizer not case sensitive by adding the ``case_sensitive: False`` option.
-    Default being ``case_sensitive: True``.
+    Make the tokenizer case insensitive by adding the ``case_sensitive: False`` option, the
+    default being ``case_sensitive: True``.
 
     .. code-block:: yaml
 
@@ -184,12 +184,11 @@ JiebaTokenizer
 ~~~~~~~~~~~~~~
 
 :Short: Tokenizer using Jieba for Chinese language
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: Nothing
 :Description:
     Creates tokens using the Jieba tokenizer specifically for Chinese
-    language. For language other than Chinese, Jieba will work as
-    ``WhitespaceTokenizer``.
+    language. It will only work for the Chinese language.
 
     .. note::
         To use ``JiebaTokenizer`` you need to install Jieba with ``pip install jieba``.
@@ -213,7 +212,7 @@ MitieTokenizer
 ~~~~~~~~~~~~~~
 
 :Short: Tokenizer using MITIE
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: :ref:`MitieNLP`
 :Description: Creates tokens using the MITIE tokenizer.
 :Configuration:
@@ -231,7 +230,7 @@ SpacyTokenizer
 ~~~~~~~~~~~~~~
 
 :Short: Tokenizer using spaCy
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: :ref:`SpacyNLP`
 :Description:
     Creates tokens using the spaCy tokenizer.
@@ -252,13 +251,13 @@ ConveRTTokenizer
 ~~~~~~~~~~~~~~~~
 
 :Short: Tokenizer using `ConveRT <https://github.com/PolyAI-LDN/polyai-models#convert>`__ model.
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: Nothing
 :Description:
     Creates tokens using the ConveRT tokenizer. Must be used whenever the :ref:`ConveRTFeaturizer` is used.
 :Configuration:
-    Make the tokenizer not case sensitive by adding the ``case_sensitive: False`` option.
-    Default being ``case_sensitive: True``.
+    Make the tokenizer case insensitive by adding the ``case_sensitive: False`` option, the
+    default being ``case_sensitive: True``.
 
     .. code-block:: yaml
 
@@ -278,7 +277,7 @@ LanguageModelTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~
 
 :Short: Tokenizer from pre-trained language models
-:Outputs: ``tokens`` for texts, responses (if present), and intents (if specified)
+:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified)
 :Requires: :ref:`HFTransformersNLP`
 :Description:
     Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component.
@@ -289,6 +288,10 @@ LanguageModelTokenizer
 
         pipeline:
         - name: "LanguageModelTokenizer"
+          # Flag to check whether to split intents
+          "intent_tokenization_flag": False
+          # Symbol on which intent should be split
+          "intent_split_symbol": "_"
 
 
 
@@ -318,7 +321,7 @@ MitieFeaturizer
 
 :Short:
     Creates a vector representation of user message and response (if specified) using the MITIE featurizer.
-:Outputs: ``dense_features`` for texts and responses
+:Outputs: ``dense_features`` for user messages and responses
 :Requires: :ref:`MitieNLP`
 :Type: Dense featurizer
 :Description:
@@ -350,7 +353,7 @@ SpacyFeaturizer
 
 :Short:
     Creates a vector representation of user message and response (if specified) using the spaCy featurizer.
-:Outputs: ``dense_features`` for texts and responses
+:Outputs: ``dense_features`` for user messages and responses
 :Requires: :ref:`SpacyNLP`
 :Type: Dense featurizer
 :Description:
@@ -378,7 +381,7 @@ ConveRTFeaturizer
 :Short:
     Creates a vector representation of user message and response (if specified) using
     `ConveRT <https://github.com/PolyAI-LDN/polyai-models>`__ model.
-:Outputs: ``dense_features`` for texts and responses
+:Outputs: ``dense_features`` for user messages and responses
 :Requires: :ref:`ConveRTTokenizer`
 :Type: Dense featurizer
 :Description:
@@ -409,7 +412,7 @@ LanguageModelFeaturizer
 
 :Short:
     Creates a vector representation of user message and response (if specified) using a pre-trained language model.
-:Outputs: ``dense_features`` for texts and responses
+:Outputs: ``dense_features`` for user messages and responses
 :Requires: :ref:`HFTransformersNLP`
 :Type: Dense featurizer
 :Description:
@@ -423,8 +426,8 @@ LanguageModelFeaturizer
 
 :Configuration:
 
-    Include ``HFTransformersNLP`` component before this component. Also, use :ref:`LanguageModelTokenizer` to ensure
-    tokens are correctly set for all components throughout the pipeline.
+    Include ``HFTransformersNLP`` and :ref:`LanguageModelTokenizer` components before this component. Use
+    :ref:`LanguageModelTokenizer` to ensure tokens are correctly set for all components throughout the pipeline.
 
     .. code-block:: yaml
 
@@ -436,12 +439,12 @@ RegexFeaturizer
 ~~~~~~~~~~~~~~~
 
 :Short: Creates a vector representation of user message using regular expressions.
-:Outputs: ``sparse_features`` for texts and ``tokens.pattern``
+:Outputs: ``sparse_features`` for user messages and ``tokens.pattern``
 :Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
     Creates features for entity extraction and intent classification.
-    During training ``RegexFeaturizer`` creates a list of `regular expressions` defined in the training
+    During training the ``RegexFeaturizer`` creates a list of `regular expressions` defined in the training
     data format.
     For each regex, a feature will be set marking whether this expression was found in the input, which will later
     be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned
@@ -460,7 +463,7 @@ CountVectorsFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~
 
 :Short: Creates bag-of-words representation of user messages, intents, and responses.
-:Outputs: ``sparse_features`` for texts, intents, and responses
+:Outputs: ``sparse_features`` for user messages, intents, and responses
 :Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
@@ -572,14 +575,14 @@ CountVectorsFeaturizer
 LexicalSyntacticFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: Creates lexical and syntactic features for user message to support entity extraction.
-:Outputs: ``sparse_features`` for texts
+:Short: Creates lexical and syntactic features for a user message to support entity extraction.
+:Outputs: ``sparse_features`` for user messages
 :Requires: ``tokens``
 :Type: Sparse featurizer
 :Description:
     Creates features for entity extraction.
     Moves with a sliding window over every token in the user message and creates features according to the
-    configuration (see below).
+    configuration (see below). As a default configuration is present, you don't need to specify a configuration.
 :Configuration:
     You need to configure what kind of lexical and syntactic features the featurizer should extract.
     The following features are available:
@@ -606,9 +609,9 @@ LexicalSyntacticFeaturizer
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
     previous tokens, the current token, and the next tokens in the sliding window.
-    You define the features as [before, token, after] array.
-    If you, for example, want to define features for the token before, the current token, and the token after,
-    your features configuration could look like this:
+    You define the features as a [before, token, after] array.
+    If you want to define features for the token before, the current token, and the token after,
+    your features configuration would look like this:
 
     .. code-block:: yaml
 
@@ -675,7 +678,7 @@ SklearnIntentClassifier
 
 :Short: Sklearn intent classifier
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: ``dense_features`` for user message
+:Requires: ``dense_features`` for user messages
 :Output-Example:
 
     .. code-block:: json
@@ -699,6 +702,10 @@ SklearnIntentClassifier
     rankings of the labels that did not "win". The ``SklearnIntentClassifier`` needs to be preceded by a dense
     featurizer in the pipeline. This dense featurizer creates the features used for the classification.
 
+    .. warning::
+        ``SklearnIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
+        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
+
 :Configuration:
     During the training of the SVM a hyperparameter search is run to
     find the best parameter set. In the config, you can specify the parameters
@@ -721,9 +728,9 @@ SklearnIntentClassifier
 EmbeddingIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:Short: Dual Intent Entity Transformer used for intent classification
+:Short: Embedding intent classifier for intent classification
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
+:Requires: ``dense_features`` and/or ``sparse_features`` for user messages and intent (optional)
 :Output-Example:
 
     .. code-block:: json
@@ -960,7 +967,7 @@ ResponseSelector
 
 :Short: Response Selector
 :Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and response
+:Requires: ``dense_features`` and/or ``sparse_features`` for user messages and response
 
 :Output-Example:
 
@@ -1161,7 +1168,7 @@ SpacyEntityExtractor
         pipeline:
         - name: "SpacyEntityExtractor"
           # dimensions to extract
-          dimensions: None
+          dimensions: ["PERSON", "LOC", "ORG", "PRODUCT"]
 
 
 EntitySynonymMapper
@@ -1249,7 +1256,8 @@ CRFEntityExtractor
         :ref:`migration guide <migration-to-rasa-1.8>` for more details.
 
 :Configuration:
-    You need to configure what kind of features the CRF should use.
+    ``CRFEntityExtractor`` has a list of default features to use.
+    However, you can overwrite the default configuration.
     The following features are available:
 
     ===============  =============================================================================
@@ -1279,7 +1287,7 @@ CRFEntityExtractor
 
     Additional you can set a flag to determine whether to use the BILOU tagging schema or not.
 
-        - ``BILOU_flag`` determines whether to use BILOU tagging or not.
+        - ``BILOU_flag`` determines whether to use BILOU tagging or not. Default ``True``.
 
     .. code-block:: yaml
 

From 3c06032a76a43c95483751e9acde8cb35593b05c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 16:46:53 +0100
Subject: [PATCH 549/633] fix tests

---
 tests/nlu/classifiers/test_diet_classifier.py |  4 ++--
 tests/nlu/featurizers/test_featurizer.py      | 16 ++++++++++------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index 074072e5ccbf..8b7eb94c9a57 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -144,8 +144,8 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir):
         )
 
     assert (
-        "Failed to validate component 'DIETClassifier'. Missing one of "
-        "the following properties: " in str(e.value)
+        "'DIETClassifier' requires ['Featurizer']. "
+        "Add required components to the pipeline." in str(e.value)
     )
 
 
diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py
index 9a640dd4b479..7561f603eebf 100644
--- a/tests/nlu/featurizers/test_featurizer.py
+++ b/tests/nlu/featurizers/test_featurizer.py
@@ -2,14 +2,18 @@
 import pytest
 import scipy.sparse
 
-from rasa.nlu.featurizers.featurizer import Featurizer, sequence_to_sentence_features
+from rasa.nlu.featurizers.featurizer import (
+    SparseFeaturizer,
+    DenseFeaturizer,
+    sequence_to_sentence_features,
+)
 from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT
 from rasa.nlu.training_data import Message
 
 
 def test_combine_with_existing_dense_features():
 
-    featurizer = Featurizer()
+    featurizer = DenseFeaturizer()
     attribute = DENSE_FEATURE_NAMES[TEXT]
 
     existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]]
@@ -27,7 +31,7 @@ def test_combine_with_existing_dense_features():
 
 
 def test_combine_with_existing_dense_features_shape_mismatch():
-    featurizer = Featurizer()
+    featurizer = DenseFeaturizer()
     attribute = DENSE_FEATURE_NAMES[TEXT]
 
     existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]]
@@ -43,7 +47,7 @@ def test_combine_with_existing_dense_features_shape_mismatch():
 
 
 def test_combine_with_existing_sparse_features():
-    featurizer = Featurizer()
+    featurizer = SparseFeaturizer()
     attribute = SPARSE_FEATURE_NAMES[TEXT]
 
     existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
@@ -62,7 +66,7 @@ def test_combine_with_existing_sparse_features():
 
 
 def test_combine_with_existing_sparse_features_shape_mismatch():
-    featurizer = Featurizer()
+    featurizer = SparseFeaturizer()
     attribute = SPARSE_FEATURE_NAMES[TEXT]
 
     existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
@@ -122,6 +126,6 @@ def test_sequence_to_sentence_features(features, expected):
     ],
 )
 def test_calculate_cls_vector(pooling, features, expected):
-    actual = Featurizer._calculate_cls_vector(features, pooling)
+    actual = DenseFeaturizer._calculate_cls_vector(features, pooling)
 
     assert np.all(actual == expected)

From bfcde0d7a7d57521091824703b4e4142eab92383 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:48:17 +0100
Subject: [PATCH 550/633] Update data/configs_for_docs/default_config.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/default_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index a9671b2a8c65..51e95f8adccb 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -32,4 +32,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: ResponseSelector
\ No newline at end of file
+  - name: ResponseSelector

From ac8c2ec8a49846e75fa361f9d3a8901a78130f0f Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:48:33 +0100
Subject: [PATCH 551/633] Update
 data/configs_for_docs/default_english_config.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/default_english_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/default_english_config.yml b/data/configs_for_docs/default_english_config.yml
index 366e2bc9aac9..ee0da9bfab1d 100644
--- a/data/configs_for_docs/default_english_config.yml
+++ b/data/configs_for_docs/default_english_config.yml
@@ -12,4 +12,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: ResponseSelector
\ No newline at end of file
+  - name: ResponseSelector

From 8d0c3dd0975ed822cf1b7f36f9d7102318b7ff32 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:48:54 +0100
Subject: [PATCH 552/633] Update data/configs_for_docs/default_spacy_config.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/default_spacy_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/default_spacy_config.yml b/data/configs_for_docs/default_spacy_config.yml
index 46b75c8078c7..4834403621ee 100644
--- a/data/configs_for_docs/default_spacy_config.yml
+++ b/data/configs_for_docs/default_spacy_config.yml
@@ -11,4 +11,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: ResponseSelector
\ No newline at end of file
+  - name: ResponseSelector

From 877e52fff7d08ea9dd9e866b35c54d16ba0ed819 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:49:13 +0100
Subject: [PATCH 553/633] Update
 data/configs_for_docs/pretrained_embeddings_convert_config_1.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 .../configs_for_docs/pretrained_embeddings_convert_config_1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
index f64e16fdded5..992e56a5e186 100644
--- a/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
+++ b/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml
@@ -1,3 +1,3 @@
 language: "en"
 
-pipeline: "pretrained_embeddings_convert"
\ No newline at end of file
+pipeline: "pretrained_embeddings_convert"

From 48740957201ede012dc618f0098bcf954931520d Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:49:42 +0100
Subject: [PATCH 554/633] Update
 data/configs_for_docs/pretrained_embeddings_convert_config_2.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 .../configs_for_docs/pretrained_embeddings_convert_config_2.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
index 8f25d3a60eae..58d393dc5d36 100644
--- a/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
+++ b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml
@@ -3,4 +3,4 @@ language: "en"
 pipeline:
 - name: "ConveRTTokenizer"
 - name: "ConveRTFeaturizer"
-- name: "EmbeddingIntentClassifier"
\ No newline at end of file
+- name: "EmbeddingIntentClassifier"

From 0543f7a105a33a6b9641354b42ea6192ba019ca3 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:50:07 +0100
Subject: [PATCH 555/633] Update
 data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
index 33a92d02a4af..3516519cd529 100644
--- a/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
+++ b/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml
@@ -1,3 +1,3 @@
 language: "en"
 
-pipeline: "pretrained_embeddings_spacy"
\ No newline at end of file
+pipeline: "pretrained_embeddings_spacy"

From 628b07a3b4ae76329aa4c10e912302c10554c60f Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:50:27 +0100
Subject: [PATCH 556/633] Update
 data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
index a87aad8163b4..14aca60c5a69 100644
--- a/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
+++ b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml
@@ -7,4 +7,5 @@ pipeline:
 - name: "RegexFeaturizer"
 - name: "CRFEntityExtractor"
 - name: "EntitySynonymMapper"
-- name: "SklearnIntentClassifier"
\ No newline at end of file
+
+- name: "SklearnIntentClassifier"

From d9f06c8fb2cd7a5f2320d09e0cc6c06b44db45f9 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:50:47 +0100
Subject: [PATCH 557/633] Update
 data/configs_for_docs/supervised_embeddings_config_1.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/supervised_embeddings_config_1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/supervised_embeddings_config_1.yml b/data/configs_for_docs/supervised_embeddings_config_1.yml
index 8ef74b4bcf93..3d965f6147d4 100644
--- a/data/configs_for_docs/supervised_embeddings_config_1.yml
+++ b/data/configs_for_docs/supervised_embeddings_config_1.yml
@@ -1,3 +1,3 @@
 language: "en"
 
-pipeline: "supervised_embeddings"
\ No newline at end of file
+pipeline: "supervised_embeddings"

From 5ae181de2ba05bb5e2146dfe96da34fa5d4f0dc1 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:51:08 +0100
Subject: [PATCH 558/633] Update
 data/configs_for_docs/supervised_embeddings_config_2.yml

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 data/configs_for_docs/supervised_embeddings_config_2.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/configs_for_docs/supervised_embeddings_config_2.yml b/data/configs_for_docs/supervised_embeddings_config_2.yml
index 2bcf54835cc2..c1a776269dae 100644
--- a/data/configs_for_docs/supervised_embeddings_config_2.yml
+++ b/data/configs_for_docs/supervised_embeddings_config_2.yml
@@ -10,4 +10,4 @@ pipeline:
   analyzer: "char_wb"
   min_ngram: 1
   max_ngram: 4
-- name: "EmbeddingIntentClassifier"
\ No newline at end of file
+- name: "EmbeddingIntentClassifier"

From 36a5be9df604d9d112601a0d08ed5585ab2b704b Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 16:55:41 +0100
Subject: [PATCH 559/633] Update rasa/core/policies/keras_policy.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/core/policies/keras_policy.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 23717965241e..0763f622bee7 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -164,7 +164,6 @@ def train(
         **kwargs: Any,
     ) -> None:
 
-        # set random seed
         np.random.seed(self.random_seed)
         tf.random.set_seed(self.random_seed)
 

From c4d1eefd061f97d4583b2ec53d5667f13d117c04 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 16:58:14 +0100
Subject: [PATCH 560/633] create DOCS_URL_MIGRATION_GUIDE

---
 rasa/constants.py                  | 1 +
 rasa/core/policies/keras_policy.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/rasa/constants.py b/rasa/constants.py
index 76729318a61a..c756eb82a1bb 100644
--- a/rasa/constants.py
+++ b/rasa/constants.py
@@ -37,6 +37,7 @@
 DOCS_URL_COMPONENTS = DOCS_BASE_URL + "/nlu/components/"
 DOCS_URL_TRAINING_DATA_NLU = DOCS_BASE_URL + "/nlu/training-data-format/"
 DOCS_URL_MIGRATE_GOOGLE = DOCS_BASE_URL + "/migrate-from/google-dialogflow-to-rasa/"
+DOCS_URL_MIGRATION_GUIDE = DOCS_BASE_URL + "/migration-guide/"
 
 LEGACY_DOCS_BASE_URL = "http://legacy-docs.rasa.com"
 
diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py
index 0763f622bee7..61d60c480949 100644
--- a/rasa/core/policies/keras_policy.py
+++ b/rasa/core/policies/keras_policy.py
@@ -19,6 +19,7 @@
 from rasa.core.trackers import DialogueStateTracker
 import rasa.utils.common as common_utils
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY
+from rasa.constants import DOCS_URL_MIGRATION_GUIDE
 
 
 # there are a number of issues with imports from tensorflow. hence the deactivation
@@ -76,7 +77,7 @@ def __init__(
             "'KerasPolicy' is deprecated and will be removed in version "
             "2.0. Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:

From 1d7aeada6ff5671ed05a12369e110cae368fe23c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 17:08:25 +0100
Subject: [PATCH 561/633] update choosing a pipeline.

---
 data/configs_for_docs/default_config.yml | 19 -------------
 docs/nlu/choosing-a-pipeline.rst         | 34 +++++++++++++++++++-----
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index a9671b2a8c65..3e8562b349bf 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -6,25 +6,6 @@ pipeline:
   - name: SpacyFeaturizer
   - name: RegexFeaturizer
   - name: LexicalSyntacticFeaturizer
-    "features": [
-      ["low", "title", "upper"],
-      [
-        "BOS",
-        "EOS",
-        "low",
-        "prefix5",
-        "prefix2",
-        "suffix5",
-        "suffix3",
-        "suffix2",
-        "upper",
-        "title",
-        "digit",
-        "pos",
-        "pos2"
-      ],
-      ["low", "title", "upper"],
-    ]
   - name: CountVectorsFeaturizer
   - name: CountVectorsFeaturizer
     analyzer: "char_wb"
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index ec9f8bf8bd39..7edb50c98e9f 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -28,8 +28,7 @@ If your training data is in English, a good starting point is the following pipe
 .. literalinclude:: ../../data/configs_for_docs/default_english_config.yml
     :language: yaml
 
-In case your training data is in a different language than English and is rich with domain specific vocabulary,
-use the following pipeline:
+In case your training data is in a different language than English, use the following pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/default_config.yml
     :language: yaml
@@ -38,14 +37,35 @@ use the following pipeline:
 A Longer Answer
 ---------------
 
-We encourage everyone to define their own pipeline by listing the names of the components you want to use.
-For example:
+We recommend to use the following pipeline, if your training data is in English:
 
-.. literalinclude:: ../../data/configs_for_docs/default_spacy_config.yml
+.. literalinclude:: ../../data/configs_for_docs/default_english_config.yml
+    :language: yaml
+
+The pipeline contains the :ref:`ConveRTFeaturizer` that provides pre-trained word embeddings of the user utterance.
+Pre-trained word embeddings are helpful as they already encode some kind of linguistic knowledge.
+For example, if you have a sentence like "I want to buy apples" in your training data, and Rasa is asked to predict
+the intent for "get pears", your model already knows that the words "apples" and "pears" are very similar.
+This is especially useful if you don’t have enough training data.
+The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
+creates a contextual vector representation for the complete sentence.
+However, ``ConveRT`` is only available in English.
+If your training data is not in English, we recommend to use the following pipeline:
+
+.. literalinclude:: ../../data/configs_for_docs/default_config.yml
     :language: yaml
 
-You can find the details of each component in :ref:`components`.
-If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
+It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`.
+``SpacyFeaturizer`` provides pre-trained word embeddings in many different languages
+(see :ref:`pretrained-word-vectors`).
+
+.. note::
+    We encourage everyone to define their own pipeline by listing the names of the components you want to use.
+    You can find the details of each component in :ref:`components`.
+    If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
+
+Choosing the right Components
+-----------------------------
 
 A pipeline usually consist of three main parts:
 

From 350bc8034627c3f916222eab0f5da6e8509ea018 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 17:13:25 +0100
Subject: [PATCH 562/633] undo changes

---
 .../nlu/utils/hugging_face/hf_transformers.py | 21 ++++++-------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index fc79433d5d8b..78e0e0893f61 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -78,21 +78,12 @@ def _load_model(self) -> None:
 
         logger.debug(f"Loading Tokenizer and Model for {self.model_name}")
 
-        try:
-            self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
-                self.model_weights
-            )
-            self.model = model_class_dict[self.model_name].from_pretrained(
-                self.model_weights
-            )
-        except OSError:
-            # if loading of models weights fail, try to download them again
-            self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
-                self.model_weights, force_download=True
-            )
-            self.model = model_class_dict[self.model_name].from_pretrained(
-                self.model_weights, force_download=True
-            )
+        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
+            self.model_weights
+        )
+        self.model = model_class_dict[self.model_name].from_pretrained(
+            self.model_weights
+        )
 
         # Use a universal pad token since all transformer architectures do not have a
         # consistent token. Instead of pad_token_id we use unk_token_id because

From aa7171bbd75e326cfe5b841888dae64beffc86d5 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 17:20:31 +0100
Subject: [PATCH 563/633] refactor config checks

---
 rasa/nlu/classifiers/diet_classifier.py | 27 ++++++++++++++++---------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 59240fc4e400..b9ea51c20e01 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -196,11 +196,7 @@ class DIETClassifier(EntityExtractor):
     }
 
     # init helpers
-    def _check_config_parameters(self) -> None:
-        self.component_config = train_utils.check_deprecated_options(
-            self.component_config
-        )
-
+    def _check_masked_lm(self) -> None:
         if (
             self.component_config[MASKED_LM]
             and self.component_config[NUM_TRANSFORMER_LAYERS] == 0
@@ -210,21 +206,32 @@ def _check_config_parameters(self) -> None:
                 f"'{MASKED_LM}' option should be 'False'."
             )
 
+    def _check_share_hidden_layers_sizes(self) -> None:
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
-            first_hidden_layer_size = next(
+            first_hidden_layer_sizes = next(
                 iter(self.component_config[HIDDEN_LAYERS_SIZES].values())
             )
-            if any(
-                current_hidden_layer_size != first_hidden_layer_size
-                for current_hidden_layer_size in self.component_config[
+            # check that all hidden layer sizes are the same
+            identical_hidden_layer_sizes = all(
+                current_hidden_layer_sizes == first_hidden_layer_sizes
+                for current_hidden_layer_sizes in self.component_config[
                     HIDDEN_LAYERS_SIZES
                 ].values()
-            ):
+            )
+            if not identical_hidden_layer_sizes:
                 raise ValueError(
                     f"If hidden layer weights are shared, "
                     f"{HIDDEN_LAYERS_SIZES} must coincide."
                 )
 
+    def _check_config_parameters(self) -> None:
+        self.component_config = train_utils.check_deprecated_options(
+            self.component_config
+        )
+
+        self._check_masked_lm()
+        self._check_share_hidden_layers_sizes()
+
         self.component_config = train_utils.update_similarity_type(
             self.component_config
         )

From ecf939bff15e20ff69f0c4c5b0c355e2a41b6bcf Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 17:52:16 +0100
Subject: [PATCH 564/633] create removal changelog

---
 changelog/5266.misc.rst    | 5 +----
 changelog/5266.removal.rst | 2 ++
 2 files changed, 3 insertions(+), 4 deletions(-)
 create mode 100644 changelog/5266.removal.rst

diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst
index 70c4b8221986..41f4875eb45f 100644
--- a/changelog/5266.misc.rst
+++ b/changelog/5266.misc.rst
@@ -6,7 +6,4 @@ include in your pipeline.
 
 Use ``DIETClassifier`` instead of ``SklearnIntentClassifier``.
 
-Use ``TEDPolicy`` instead of ``KerasPolicy``.
-
-Properties ``Component.provides`` and ``Component.requires`` are deprecated.
-Use ``Component.required_components()`` instead.
\ No newline at end of file
+Use ``TEDPolicy`` instead of ``KerasPolicy``.
\ No newline at end of file
diff --git a/changelog/5266.removal.rst b/changelog/5266.removal.rst
new file mode 100644
index 000000000000..f8b4270bbcf8
--- /dev/null
+++ b/changelog/5266.removal.rst
@@ -0,0 +1,2 @@
+Properties ``Component.provides`` and ``Component.requires`` are deprecated.
+Use ``Component.required_components()`` instead.
\ No newline at end of file

From 1e77c693f69102159b7b90c83f863c7ad4614aa3 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 17:59:33 +0100
Subject: [PATCH 565/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 83b2dfe63d46..b7c08f75d9b8 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -266,7 +266,7 @@ def __init__(
         self.model = model
 
         # encode all label_ids with numbers
-        self._label_data = None  # RasaModelData
+        self._label_data: Optional[RasaModelData] = None
 
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes

From 18f7d9afc6a0617a1f51e0d21915dee63df38eed Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 18:00:01 +0100
Subject: [PATCH 566/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b7c08f75d9b8..28c6ed41e16d 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -274,7 +274,7 @@ def __init__(
         # number of entity tags
         self.num_tags = 0
 
-        self.data_example = None  # Dict[Text, List[np.ndarray]]
+        self.data_example: Dict[Text, List[np.ndarray]] = None
 
     @property
     def label_key(self) -> Optional[Text]:

From db0d794fdb033652cd45611bd67dc44abe0aeddf Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 18:02:36 +0100
Subject: [PATCH 567/633] set num_tags to None in init

---
 rasa/nlu/classifiers/diet_classifier.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 28c6ed41e16d..1dada2d27c19 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -265,16 +265,16 @@ def __init__(
 
         self.model = model
 
-        # encode all label_ids with numbers
-        self._label_data: Optional[RasaModelData] = None
-
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes
 
+        # encode all label_ids with numbers
+        self._label_data: Optional[RasaModelData] = None
+
         # number of entity tags
-        self.num_tags = 0
+        self.num_tags: Optional[int] = None
 
-        self.data_example: Dict[Text, List[np.ndarray]] = None
+        self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None
 
     @property
     def label_key(self) -> Optional[Text]:

From b02135c2f3bf0c005af40e69d7fcbdb3cb5a6d57 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Mon, 24 Feb 2020 18:14:09 +0100
Subject: [PATCH 568/633] substitute Any with Type[...]

---
 rasa/nlu/classifiers/diet_classifier.py                   | 7 ++++---
 rasa/nlu/classifiers/embedding_intent_classifier.py       | 5 +++--
 rasa/nlu/classifiers/mitie_intent_classifier.py           | 5 +++--
 rasa/nlu/classifiers/sklearn_intent_classifier.py         | 5 +++--
 rasa/nlu/components.py                                    | 6 +++---
 rasa/nlu/extractors/crf_entity_extractor.py               | 5 +++--
 rasa/nlu/extractors/mitie_entity_extractor.py             | 5 +++--
 rasa/nlu/extractors/spacy_entity_extractor.py             | 5 +++--
 .../featurizers/dense_featurizer/convert_featurizer.py    | 5 +++--
 rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py    | 5 +++--
 rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py | 5 +++--
 rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 5 +++--
 .../sparse_featurizer/count_vectors_featurizer.py         | 5 +++--
 .../sparse_featurizer/lexical_syntactic_featurizer.py     | 5 +++--
 .../nlu/featurizers/sparse_featurizer/regex_featurizer.py | 5 +++--
 rasa/nlu/selectors/response_selector.py                   | 7 ++++---
 rasa/nlu/tokenizers/lm_tokenizer.py                       | 8 +++-----
 rasa/nlu/tokenizers/spacy_tokenizer.py                    | 7 ++++---
 tests/nlu/example_component.py                            | 2 +-
 19 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 1dada2d27c19..b52524b43444 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -8,11 +8,12 @@
 import tensorflow as tf
 import tensorflow_addons as tfa
 
-from typing import Any, Dict, List, Optional, Text, Tuple, Union
+from typing import Any, Dict, List, Optional, Text, Tuple, Union, Type
 
 import rasa.utils.io as io_utils
 import rasa.nlu.utils.bilou_utils as bilou_utils
 from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.components import Component
 from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.test import determine_token_labels
@@ -93,7 +94,7 @@ class DIETClassifier(IntentClassifier, EntityExtractor):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Featurizer]
 
     # please make sure to update the docs when changing a default parameter
@@ -281,7 +282,7 @@ def label_key(self) -> Optional[Text]:
         return "label_ids" if self.component_config[INTENT_CLASSIFICATION] else None
 
     @staticmethod
-    def model_class() -> Any:
+    def model_class() -> Type[RasaModel]:
         return DIET
 
     # training data helpers:
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 31172a62194a..d36adf590e69 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,7 +1,8 @@
 import logging
-from typing import Any, Dict, Optional, Text, List
+from typing import Any, Dict, Optional, Text, List, Type
 
 from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.components import Component
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
 from rasa.nlu.constants import TEXT
 from rasa.utils.tensorflow.constants import (
@@ -55,7 +56,7 @@ class EmbeddingIntentClassifier(DIETClassifier):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Featurizer]
 
     # please make sure to update the docs when changing a default parameter
diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py
index ba3f7e727aa6..5a971ab18b9a 100644
--- a/rasa/nlu/classifiers/mitie_intent_classifier.py
+++ b/rasa/nlu/classifiers/mitie_intent_classifier.py
@@ -1,10 +1,11 @@
 import os
 import typing
-from typing import Any, Dict, List, Optional, Text
+from typing import Any, Dict, List, Optional, Text, Type
 
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
 from rasa.nlu.classifiers.classifier import IntentClassifier
+from rasa.nlu.components import Component
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.model import Metadata
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT
@@ -16,7 +17,7 @@
 
 class MitieIntentClassifier(IntentClassifier):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [MitieNLP, Tokenizer]
 
     def __init__(
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 2cb027141efb..efa158f65441 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -2,7 +2,7 @@
 import os
 import typing
 import warnings
-from typing import Any, Dict, List, Optional, Text, Tuple
+from typing import Any, Dict, List, Optional, Text, Tuple, Type
 
 import numpy as np
 
@@ -10,6 +10,7 @@
 from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
+from rasa.nlu.components import Component
 from rasa.nlu.classifiers.classifier import IntentClassifier
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT
@@ -28,7 +29,7 @@ class SklearnIntentClassifier(IntentClassifier):
     """Intent classifier using the sklearn framework"""
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [DenseFeaturizer]
 
     defaults = {
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index 8db499764169..c664e857d9da 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -1,6 +1,6 @@
 import logging
 import typing
-from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple
+from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple, Type
 
 from rasa.nlu.config import RasaNLUModelConfig, override_defaults, InvalidConfigError
 from rasa.nlu.training_data import Message, TrainingData
@@ -100,7 +100,7 @@ def validate_tokenizers(pipeline: List["Component"]) -> None:
 
 
 def _required_component_in_pipeline(
-    required_component: Any, pipeline: List["Component"]
+    required_component: Type["Component"], pipeline: List["Component"]
 ) -> bool:
     """Checks that required component present in the pipeline.
 
@@ -283,7 +283,7 @@ def name(self):
     # Which components are required by this component.
     # Listed components should appear before the component itself in the pipeline.
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type["Component"]]:
         """Specify which components need to be present in the pipeline."""
 
         return []
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 851dab60da78..780a3a47960b 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -2,12 +2,13 @@
 import os
 import typing
 import numpy as np
-from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple
+from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple, Type
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
 import rasa.utils.common as common_utils
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -31,7 +32,7 @@ class CRFToken(NamedTuple):
 
 class CRFEntityExtractor(EntityExtractor):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Tokenizer]
 
     defaults = {
diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py
index b78bcdad79cd..22dd94eb9139 100644
--- a/rasa/nlu/extractors/mitie_entity_extractor.py
+++ b/rasa/nlu/extractors/mitie_entity_extractor.py
@@ -1,12 +1,13 @@
 import logging
 import os
 import typing
-from typing import Any, Dict, List, Optional, Text
+from typing import Any, Dict, List, Optional, Text, Type
 
 from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
@@ -20,7 +21,7 @@
 
 class MitieEntityExtractor(EntityExtractor):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [MitieNLP, Tokenizer]
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None, ner=None):
diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py
index 53be4f389eaa..749445acb887 100644
--- a/rasa/nlu/extractors/spacy_entity_extractor.py
+++ b/rasa/nlu/extractors/spacy_entity_extractor.py
@@ -1,8 +1,9 @@
 import typing
-from typing import Any, Dict, List, Text, Optional
+from typing import Any, Dict, List, Text, Optional, Type
 
 from rasa.nlu.constants import ENTITIES
 from rasa.nlu.utils.spacy_utils import SpacyNLP
+from rasa.nlu.components import Component
 from rasa.nlu.extractors.extractor import EntityExtractor
 from rasa.nlu.training_data import Message
 
@@ -12,7 +13,7 @@
 
 class SpacyEntityExtractor(EntityExtractor):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [SpacyNLP]
 
     defaults = {
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index 98d6ba393d11..e3c9a9d017ef 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -1,9 +1,10 @@
 import logging
-from typing import Any, Dict, List, Optional, Text, Tuple
+from typing import Any, Dict, List, Optional, Text, Tuple, Type
 from tqdm import tqdm
 
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.tokenizers.tokenizer import Token
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
 from rasa.nlu.config import RasaNLUModelConfig
@@ -32,7 +33,7 @@ class ConveRTFeaturizer(DenseFeaturizer):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [ConveRTTokenizer]
 
     def _load_from_tfhub(self, model_url: Text):
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 4ca561d5e66f..5e32ecd84f47 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -1,7 +1,8 @@
 import numpy as np
-from typing import Any, Optional, Text, List
+from typing import Any, Optional, Text, List, Type
 
 from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
@@ -24,7 +25,7 @@ class LanguageModelFeaturizer(DenseFeaturizer):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [HFTransformersNLP, LanguageModelTokenizer]
 
     def train(
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 2f743ee1b8eb..0a0af6ef73a7 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -1,8 +1,9 @@
 import numpy as np
 import typing
-from typing import Any, List, Text, Optional, Dict
+from typing import Any, List, Text, Optional, Dict, Type
 
 from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
@@ -20,7 +21,7 @@
 
 class MitieFeaturizer(DenseFeaturizer):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [MitieNLP, Tokenizer]
 
     defaults = {
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 0f2ca6ade5e3..5b441301fb99 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -1,8 +1,9 @@
 import numpy as np
 import typing
-from typing import Any, Optional, Text, Dict, List
+from typing import Any, Optional, Text, Dict, List, Type
 
 from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -20,7 +21,7 @@
 
 class SpacyFeaturizer(DenseFeaturizer):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [SpacyNLP, SpacyTokenizer]
 
     defaults = {
diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
index 7507113cc437..cf5d33dfd4e0 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py
@@ -2,7 +2,7 @@
 import os
 import re
 import scipy.sparse
-from typing import Any, Dict, List, Optional, Text
+from typing import Any, Dict, List, Optional, Text, Type
 
 from rasa.constants import DOCS_URL_COMPONENTS
 import rasa.utils.common as common_utils
@@ -10,6 +10,7 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message, TrainingData
@@ -38,7 +39,7 @@ class CountVectorsFeaturizer(SparseFeaturizer):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Tokenizer]
 
     defaults = {
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 59dc25897d2c..d67fb7a98871 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -4,9 +4,10 @@
 
 import numpy as np
 import scipy.sparse
-from typing import Any, Dict, Optional, Text, List
+from typing import Any, Dict, Optional, Text, List, Type
 
 from rasa.constants import DOCS_URL_COMPONENTS
+from rasa.nlu.components import Component
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
 from rasa.nlu.featurizers.featurizer import SparseFeaturizer
@@ -21,7 +22,7 @@
 
 class LexicalSyntacticFeaturizer(SparseFeaturizer):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Tokenizer]
 
     defaults = {
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
index 9bec70681f52..c1af343d64e0 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -1,7 +1,7 @@
 import logging
 import os
 import re
-from typing import Any, Dict, List, Optional, Text, Union
+from typing import Any, Dict, List, Optional, Text, Union, Type
 
 import numpy as np
 
@@ -19,6 +19,7 @@
     TOKENS_NAMES,
 )
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import SparseFeaturizer
 from rasa.nlu.training_data import Message, TrainingData
 import rasa.utils.common as common_utils
@@ -29,7 +30,7 @@
 
 class RegexFeaturizer(SparseFeaturizer):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Tokenizer]
 
     def __init__(
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 7d4593ab05c7..1910f7a2148e 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -3,9 +3,10 @@
 import numpy as np
 import tensorflow as tf
 
-from typing import Any, Dict, Optional, Text, Tuple, Union, List
+from typing import Any, Dict, Optional, Text, Tuple, Union, List, Type
 
 from rasa.nlu.training_data import TrainingData, Message
+from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
 from rasa.utils.tensorflow.constants import (
@@ -80,7 +81,7 @@ class ResponseSelector(DIETClassifier):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [Featurizer]
 
     defaults = {
@@ -204,7 +205,7 @@ def label_key(self) -> Text:
         return "label_ids"
 
     @staticmethod
-    def model_class():
+    def model_class() -> Type[RasaModel]:
         return DIET2DIET
 
     def _load_selector_params(self, config: Dict[Text, Any]) -> None:
diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py
index 0c6ca9ee215e..56ac683ddf60 100644
--- a/rasa/nlu/tokenizers/lm_tokenizer.py
+++ b/rasa/nlu/tokenizers/lm_tokenizer.py
@@ -1,14 +1,12 @@
-from typing import Text, List, Any, Dict
+from typing import Text, List, Any, Dict, Type
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.nlu.training_data import Message
 
 from rasa.nlu.constants import (
-    TOKENS_NAMES,
     LANGUAGE_MODEL_DOCS,
-    DENSE_FEATURIZABLE_ATTRIBUTES,
-    MESSAGE_ATTRIBUTES,
     TOKENS,
 )
 
@@ -21,7 +19,7 @@ class LanguageModelTokenizer(Tokenizer):
     """
 
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [HFTransformersNLP]
 
     defaults = {
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 65a8788abb20..76aa75be00c4 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -1,11 +1,12 @@
 import typing
-from typing import Text, List, Any
+from typing import Text, List, Any, Type
 
 from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.components import Component
 from rasa.nlu.utils.spacy_utils import SpacyNLP
 from rasa.nlu.training_data import Message
 
-from rasa.nlu.constants import TOKENS_NAMES, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES
+from rasa.nlu.constants import SPACY_DOCS
 
 if typing.TYPE_CHECKING:
     from spacy.tokens.doc import Doc  # pytype: disable=import-error
@@ -18,7 +19,7 @@
 
 class SpacyTokenizer(Tokenizer):
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         return [SpacyNLP]
 
     defaults = {
diff --git a/tests/nlu/example_component.py b/tests/nlu/example_component.py
index 3fb3a6a4e40f..7b11e7d53a75 100644
--- a/tests/nlu/example_component.py
+++ b/tests/nlu/example_component.py
@@ -15,7 +15,7 @@ class MyComponent(Component):
     # Which components are required by this component.
     # Listed components should appear before the component itself in the pipeline.
     @classmethod
-    def required_components(cls) -> List[Any]:
+    def required_components(cls) -> List[Type[Component]]:
         """Specify which components need to be present in the pipeline."""
 
         return []

From 1831f46766b6992d81a24f2a8808f85c227dd521 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 21:09:56 +0100
Subject: [PATCH 569/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b52524b43444..40d1f26f8ead 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -290,7 +290,7 @@ def model_class() -> Type[RasaModel]:
     def _create_label_id_dict(
         training_data: TrainingData, attribute: Text
     ) -> Dict[Text, int]:
-        """Create label_id dictionary"""
+        """Create label_id dictionary."""
 
         distinct_label_ids = {
             example.get(attribute) for example in training_data.intent_examples

From 872859923d741639a1c3092490cd1554708cb561 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Mon, 24 Feb 2020 21:13:19 +0100
Subject: [PATCH 570/633] Update rasa/nlu/classifiers/diet_classifier.py

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 40d1f26f8ead..2f4ac2c03242 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -386,7 +386,7 @@ def _extract_features(
 
         return sparse_features, dense_features
 
-    def check_input_dimension_consistency(self, model_data: RasaModelData):
+    def check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
         """Checks if text features and label features have the same dimensionality if
         hidden layers are shared."""
         if self.component_config.get(SHARE_HIDDEN_LAYERS):

From 3915a0b9d2e7bfbaa4b974687fc47e19d93b5e44 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 22:05:10 +0100
Subject: [PATCH 571/633] add missing import

---
 tests/nlu/example_component.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nlu/example_component.py b/tests/nlu/example_component.py
index 7b11e7d53a75..67a47e477b88 100644
--- a/tests/nlu/example_component.py
+++ b/tests/nlu/example_component.py
@@ -1,5 +1,5 @@
 import typing
-from typing import Any, Optional, Text, Dict, List
+from typing import Any, Optional, Text, Dict, List, Type
 
 from rasa.nlu.components import Component
 from rasa.nlu.config import RasaNLUModelConfig

From 84bced61f72ce73203782197fc2a2ad1a3e37f1c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 24 Feb 2020 22:32:01 +0100
Subject: [PATCH 572/633] fix types

---
 rasa/nlu/classifiers/diet_classifier.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c08720ec1253..5eb69682977b 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -639,10 +639,10 @@ def train(
         self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
 
         self.model = self.model_class()(
-            model_data.get_signature(),
-            self._label_data,
-            self.inverted_tag_dict,
-            self.component_config,
+            data_signature=model_data.get_signature(),
+            label_data=self._label_data,
+            inverted_tag_dict=self.inverted_tag_dict,
+            config=self.component_config,
         )
 
         self.model.fit(

From 0ae0e58dc462852e73df237753ad3cd68401a2ef Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 08:15:25 +0100
Subject: [PATCH 573/633] fix incorrect import

---
 rasa/core/training/story_conflict.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rasa/core/training/story_conflict.py b/rasa/core/training/story_conflict.py
index 2510608a68b8..d77452c4b990 100644
--- a/rasa/core/training/story_conflict.py
+++ b/rasa/core/training/story_conflict.py
@@ -1,12 +1,12 @@
 import logging
-from collections import defaultdict, namedtuple
+from collections import defaultdict
 from typing import List, Optional, Dict, Text, Tuple, Generator, NamedTuple
 
 from rasa.core.actions.action import ACTION_LISTEN_NAME
 from rasa.core.domain import PREV_PREFIX, Domain
 from rasa.core.events import ActionExecuted, Event
 from rasa.core.featurizers import MaxHistoryTrackerFeaturizer
-from rasa.nlu.constants import INTENT_ATTRIBUTE
+from rasa.nlu.constants import INTENT
 from rasa.core.training.generator import TrackerWithCachedStates
 
 logger = logging.getLogger(__name__)
@@ -25,7 +25,7 @@ class StoryConflict:
                                    prior events (i.e. at the beginning of a dialogue).
     """
 
-    def __init__(self, sliced_states: List[Optional[Dict[Text, float]]],) -> None:
+    def __init__(self, sliced_states: List[Optional[Dict[Text, float]]]) -> None:
         """
         Creates a `StoryConflict` from a given state.
 
@@ -314,10 +314,10 @@ def _get_previous_event(
         ):
             # The `prev_...` was an action that was NOT `action_listen`
             return "action", turn_label.replace(PREV_PREFIX, "")
-        elif turn_label.startswith(INTENT_ATTRIBUTE + "_"):
+        elif turn_label.startswith(INTENT + "_"):
             # We found an intent, but it is only the previous event if
             # the `prev_...` was `prev_action_listen`, so we don't return.
             previous_event_type = "intent"
-            previous_event_name = turn_label.replace(INTENT_ATTRIBUTE + "_", "")
+            previous_event_name = turn_label.replace(INTENT + "_", "")
 
     return previous_event_type, previous_event_name

From f914fbee344a95027270d6eaec5306534afd013f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 09:11:14 +0100
Subject: [PATCH 574/633] documentation review comments

---
 docs/migration-guide.rst         | 2 ++
 docs/nlu/choosing-a-pipeline.rst | 9 +++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index f0daa232efb9..de0c8827080d 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -90,6 +90,7 @@ General
       # ... any other parameters
 
   See :ref:`diet-classifier` for more information about the new component.
+  The classifier ``EmbeddingIntentClassifier`` still exists out of now and behaves the same as before.
 
 - ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. If you want to
   get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration:
@@ -127,6 +128,7 @@ General
   We extracted the featurization from the component into the new featurizer :ref:``LexicalSyntacticFeaturizer``. Thus,
   in order to obtain the same results as before, you need to add this featurizer to your pipeline before the
   :ref:``diet-classifier``.
+  The entity extractor ``CRFEntityExtractor`` still exists out of now and behaves the same as before.
 
 .. _migration-to-rasa-1.7:
 
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 7edb50c98e9f..be9b9681a9f2 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -89,11 +89,12 @@ Featurization
 ~~~~~~~~~~~~~
 You need to decide whether to use components that provide pre-trained word embeddings or not.
 
-If you do not use any pre-trained word embeddings, your word vectors will be customised for your domain. For example,
-in general English, the word "balance" is closely related to "symmetry", but very different to the word "cash". In a
-banking domain, "balance" and "cash" are closely related and you'd like your model to capture that. If you don't
+If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
+and can train your model to be more domain specific. For example, in general English, the word "balance" is closely
+related to "symmetry", but very different to the word "cash". In a banking domain, "balance" and "cash" are closely
+related and you'd like your model to capture that. If you don't
 use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language and domain.
-In those cases you should only use featurizers from the category `sparse` featurizers, such as
+In those cases you should only use featurizers from the category :ref:`sparse featurizers <text-featurizers>`, such as
 ``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``.
 
 The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:

From 13985f5e1389714da862160423d789ed8819e925 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 09:22:18 +0100
Subject: [PATCH 575/633] documentation review comments

---
 docs/nlu/choosing-a-pipeline.rst |  6 +++---
 docs/nlu/components.rst          | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index be9b9681a9f2..dfbce0d90e20 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -412,9 +412,9 @@ To train a Rasa model in your preferred language, define the
 .. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_1.yml
     :language: yaml
 
-The ``supervised_embeddings`` pipeline supports any language that can be tokenized.  By default it uses whitespace
-for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are the default
-components that make up the ``supervised_embeddings`` pipeline:
+The ``supervised_embeddings`` pipeline supports any language that can be whitespace tokenized. By default it uses
+whitespace for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are
+the default components that make up the ``supervised_embeddings`` pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_2.yml
     :language: yaml
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 3ad3f49b7e16..dc173fff03f9 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -330,10 +330,11 @@ MitieFeaturizer
 
     .. note::
 
-        NOT used by the ``MitieIntentClassifier`` component.
+        NOT used by the ``MitieIntentClassifier`` component. But can be used by any component later in the pipeline
+        that makes use of ``dense_features``.
 
 :Configuration:
-    The sentence vector, e.g. the vector of the ``__CLS__`` token can be calculated in two different ways, either via
+    The sentence vector, i.e. the vector of the ``__CLS__`` token, can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
     The default pooling method is set to ``mean``.
 
@@ -360,7 +361,7 @@ SpacyFeaturizer
     Creates features for entity extraction, intent classification, and response classification using the spaCy
     featurizer.
 :Configuration:
-    The sentence vector, e.g. the vector of the ``__CLS__`` token can be calculated in two different ways, either via
+    The sentence vector, i.e. the vector of the ``__CLS__`` token, can be calculated in two different ways, either via
     mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``.
     The default pooling method is set to ``mean``.
 
@@ -730,7 +731,7 @@ EmbeddingIntentClassifier
 
 :Short: Embedding intent classifier for intent classification
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user messages and intent (optional)
+:Requires: ``dense_features`` and/or ``sparse_features`` for user messages, and optionally the intent
 :Output-Example:
 
     .. code-block:: json
@@ -1404,7 +1405,7 @@ DIETClassifier
 
 :Short: Dual Intent Entity Transformer (DIET) used for intent classification and entity extraction
 :Outputs: ``entities``, ``intent`` and ``intent_ranking``
-:Requires: ``dense_features`` and/or ``sparse_features`` for user message and intent (optional)
+:Requires: ``dense_features`` and/or ``sparse_features`` for user message and, and optionally the intent
 :Output-Example:
 
     .. code-block:: json

From 78b32cacd75c053334fe46de5a7ad9aa1a89e3f7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 09:38:31 +0100
Subject: [PATCH 576/633] documentation review comments

---
 docs/nlu/components.rst                 | 173 +++++++++++++-----------
 rasa/nlu/selectors/response_selector.py |  14 +-
 2 files changed, 98 insertions(+), 89 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index dc173fff03f9..a2390eeb9ad6 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1003,94 +1003,103 @@ ResponseSelector
     In addition, the component can also be configured to train a response selector for a particular retrieval intent.
 
         - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
+          Default is ``None``, i.e. the model is trained for all retrieval intents.
 
     Default values:
 
     .. code-block:: yaml
 
         pipeline:
-        - name: "ResponseSelector"
-            # nn architecture
-            # sizes of hidden layers before the embedding layer
-            # for input words and intent labels,
-            # the number of hidden layers is thus equal to the length of this list
-            "hidden_layers_sizes": {"text": [256, 128], "label": [256, 128]}
-            # Whether to share the hidden layer weights between input words and labels
-            "share_hidden_layers": False
-            # number of units in transformer
-            "transformer_size": None
-            # number of transformer layers
-            "number_of_transformer_layers": 0
-            # number of attention heads in transformer
-            "number_of_attention_heads": 4
-            # use a unidirectional or bidirectional encoder
-            "unidirectional_encoder": False
-            # if true use key relative embeddings in attention
-            "use_key_relative_attention": False
-            # if true use key relative embeddings in attention
-            "use_value_relative_attention": False
-            # max position for relative embeddings
-            "max_relative_position": None
-            # training parameters
-            # initial and final batch sizes - batch size will be
-            # linearly increased for each epoch
-            "batch_size": [64, 256]
-            # how to create batches
-            "batch_strategy": "balanced"  # string 'sequence' or 'balanced'
-            # number of epochs
-            "epochs": 300
-            # set random seed to any int to get reproducible results
-            "random_seed": None
-            # optimizer
-            "learning_rate": 0.001
-            # embedding parameters
-            # default dense dimension used if no dense features are present
-            "dense_dimension": {"text": 512, "label": 512}
-            # dimension size of embedding vectors
-            "embedding_dimension": 20
-            # the type of the similarity
-            "number_of_negative_examples": 20
-            # flag if minimize only maximum similarity over incorrect actions
-            "similarity_type": "auto"  # string 'auto' or 'cosine' or 'inner'
-            # the type of the loss function
-            "loss_type": "softmax"  # string 'softmax' or 'margin'
-            # number of top intents to normalize scores for softmax loss_type
-            # set to 0 to turn off normalization
-            "ranking_length": 10
-            # how similar the algorithm should try
-            # to make embedding vectors for correct labels
-            "maximum_positive_similarity": 0.8  # should be 0.0 < ... < 1.0 for 'cosine'
-            # maximum negative similarity for incorrect labels
-            "maximum_negative_similarity": -0.4  # should be -1.0 < ... < 1.0 for 'cosine'
-            # flag: if true, only minimize the maximum similarity for incorrect labels
-            "use_maximum_negative_similarity": True
-            # scale loss inverse proportionally to confidence of correct prediction
-            "scale_loss": True
-            # regularization parameters
-            # the scale of regularization
-            "regularization_constant": 0.002
-            # the scale of how critical the algorithm should be of minimizing the
-            # maximum similarity between embeddings of different labels
-            "negative_margin_scale": 0.8
-            # dropout rate for rnn
-            "droprate": 0.2
-            # dropout rate for attention
-            "droprate_attention": 0
-            # sparsity of the weights in dense layers
-            "weight_sparsity": 0.8
-            # if true apply dropout to sparse tensors
-            "use_sparse_input_dropout": True
-            # visualization of accuracy
-            # how often to calculate training accuracy
-            "evaluate_every_number_of_epochs": 20  # small values may hurt performance
-            # how many examples to use for calculation of training accuracy
-            "evaluate_on_number_of_examples": 0  # large values may hurt performance
-            # if true random tokens of the input message will be masked and the model
-            # should predict those tokens
-            "use_masked_language_model": False
-            # selector config
-            # name of the intent for which this response selector is to be trained
-            "retrieval_intent": None
+          - name: "ResponseSelector"
+            # ## Architecture of the used neural network
+            # Hidden layer sizes for layers before the embedding layers for user message
+            # and labels.
+            # The number of hidden layers is equal to the length of the corresponding
+            # list.
+            hidden_layers_sizes: {"text": [256, 128], "label": [256, 128]},
+            # Whether to share the hidden layer weights between input words and responses
+            "share_hidden_layers": False,
+            # Number of units in transformer
+            "transformer_size": None,
+            # Number of transformer layers
+            "number_of_transformer_layers": 0,
+            # Number of attention heads in transformer
+            "number_of_attention_heads": 4,
+            # If 'True' use key relative embeddings in attention
+            "use_key_relative_attention": False,
+            # If 'True' use key relative embeddings in attention
+            "use_value_relative_attention": False,
+            # Max position for relative embeddings
+            "max_relative_position": None,
+            # Use a unidirectional or bidirectional encoder.
+            "unidirectional_encoder": False,
+            # ## Training parameters
+            # Initial and final batch sizes:
+            # Batch size will be linearly increased for each epoch.
+            "batch_size": [64, 256],
+            # Strategy used when creating batches.
+            # Can be either 'sequence' or 'balanced'.
+            "batch_strategy": "balanced",
+            # Number of epochs to train
+            "epochs": 300,
+            # Set random seed to any 'int' to get reproducible results
+            "random_seed": None,
+            # Initial learning rate for the optimizer
+            "learning_rate": 0.001,
+            # ## Parameters for embeddings
+            # Dimension size of embedding vectors
+            "embedding_dimension": 20,
+            # Default dense dimension to use if no dense features are present.
+            "dense_dimension": {TEXT: 512, LABEL: 512},
+            # The number of incorrect labels. The algorithm will minimize
+            # their similarity to the user input during training.
+            "number_of_negative_examples": 20,
+            # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+            "similarity_type": "auto",
+            # The type of the loss function, either 'softmax' or 'margin'.
+            "loss_type": "softmax",
+            # Number of top actions to normalize scores for loss type 'softmax'.
+            # Set to 0 to turn off normalization.
+            "ranking_length": 10,
+            # Indicates how similar the algorithm should try to make embedding vectors
+            # for correct labels.
+            # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_positive_similarity": 0.8,
+            # Maximum negative similarity for incorrect labels.
+            # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+            "maximum_negative_similarity": -0.4,
+            # If 'True' the algorithm only minimizes maximum similarity over
+            # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
+            "use_maximum_negative_similarity": True,
+            # Scale loss inverse proportionally to confidence of correct prediction
+            "scale_loss": True,
+            # ## Regularization parameters
+            # The scale of regularization
+            "regularization_constant": 0.002,
+            # Sparsity of the weights in dense layers
+            "weight_sparsity": 0.8,
+            # The scale of how important is to minimize the maximum similarity
+            # between embeddings of different labels.
+            "negative_margin_scale": 0.8,
+            # Dropout rate for encoder
+            "drop_rate": 0.2,
+            # Dropout rate for attention
+            "drop_rate_attention": 0,
+            # If 'True' apply dropout to sparse tensors
+            "use_sparse_input_dropout": False,
+            # ## Evaluation parameters
+            # How often calculate validation accuracy.
+            # Small values may hurt performance, e.g. model accuracy.
+            "evaluate_every_number_of_epochs": 20,
+            # How many examples to use for hold out validation set
+            # Large values may hurt performance, e.g. model accuracy.
+            "evaluate_on_number_of_examples": 0,
+            # ## Selector config
+            # If 'True' random tokens of the input message will be masked and the model
+            # should predict those tokens.
+            "use_masked_language_model": False,
+            # Name of the intent for which this response selector is to be trained
+            "retrieval_intent: None,
 
 
 Entity Extractors
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 1910f7a2148e..f9dd2c7d4663 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -93,19 +93,19 @@ def required_components(cls) -> List[Type[Component]]:
         HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
         # Whether to share the hidden layer weights between input words and responses
         SHARE_HIDDEN_LAYERS: False,
-        # number of units in transformer
+        # Number of units in transformer
         TRANSFORMER_SIZE: None,
-        # number of transformer layers
+        # Number of transformer layers
         NUM_TRANSFORMER_LAYERS: 0,
-        # number of attention heads in transformer
+        # Number of attention heads in transformer
         NUM_HEADS: 4,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         KEY_RELATIVE_ATTENTION: False,
-        # if true use key relative embeddings in attention
+        # If 'True' use key relative embeddings in attention
         VALUE_RELATIVE_ATTENTION: False,
-        # max position for relative embeddings
+        # Max position for relative embeddings
         MAX_RELATIVE_POSITION: None,
-        # use a unidirectional or bidirectional encoder
+        # Use a unidirectional or bidirectional encoder.
         UNIDIRECTIONAL_ENCODER: False,
         # ## Training parameters
         # Initial and final batch sizes:

From b7586701052ba8fc2df45bcfc3e44272c4d356fd Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 10:41:31 +0100
Subject: [PATCH 577/633] documentation review comments

---
 docs/nlu/choosing-a-pipeline.rst  |  5 +++--
 docs/nlu/components.rst           | 16 +++++++++-------
 examples/restaurantbot/config.yml | 15 ---------------
 3 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index dfbce0d90e20..e4b711650171 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -56,7 +56,7 @@ If your training data is not in English, we recommend to use the following pipel
     :language: yaml
 
 It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`.
-``SpacyFeaturizer`` provides pre-trained word embeddings in many different languages
+``SpacyFeaturizer`` provides pre-trained word embeddings from either GloVe or fastText in many different languages
 (see :ref:`pretrained-word-vectors`).
 
 .. note::
@@ -131,7 +131,8 @@ So, this featurizer can also be an alternate option depending on the language of
 
 Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Depending on your data you may want to only perform intent classification or entity recognition.
+Depending on your data you may want to only perform intent classification, entity recognition or response selection.
+Or you might want to combine multiple of those tasks.
 We support several components for each of the task. All of them are listed in :ref:`components`.
 We recommend to use :ref:`diet-classifier` for intent classification and entity recognition
 and :ref:`response-selector` for response selection.
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index a2390eeb9ad6..ad991ffc370d 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -827,11 +827,12 @@ EmbeddingIntentClassifier
 
         - regularization:
 
-            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``regularization_constant`` sets the scale of L2 regularization. Higher values will result in more
+              regularization.
             - ``negative_margin_scale`` sets the scale of how important is to minimize
               the maximum similarity between embeddings of different intent labels.
             - ``drop_rate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
+              between ``0`` and ``1``, e.g. ``drop_rate=0.1`` would drop out ``10%`` of input units.
             - ``weight_sparsity`` sets the sparsity of the weght kernels in dense layers.
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
@@ -1522,13 +1523,14 @@ DIETClassifier
 
         - regularization:
 
-            - ``regularization_constant`` sets the scale of L2 regularization.
+            - ``regularization_constant`` sets the scale of L2 regularization. Higher values will result in more
+              regularization.
             - ``negative_margin_scale`` sets the scale of how important is to minimize
               the maximum similarity between embeddings of different intent labels.
-            - ``droprate`` sets the dropout rate, it should be
-              between ``0`` and ``1``, e.g. ``droprate=0.1`` would drop out ``10%`` of input units.
-            - ``droprate_attention`` sets the dropout rate for attention, it should be
-              between ``0`` and ``1``, e.g. ``droprate_attention=0.1`` would drop out ``10%`` of input units.
+            - ``drop_rate`` sets the dropout rate, it should be
+              between ``0`` and ``1``, e.g. ``drop_rate=0.1`` would drop out ``10%`` of input units.
+            - ``drop_rate_attention`` sets the dropout rate for attention, it should be
+              between ``0`` and ``1``, e.g. ``drop_rate_attention=0.1`` would drop out ``10%`` of input units.
             - ``weight_sparsity`` sets the sparsity of weight kernels in dense layers.
             - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
 
diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml
index 291570d0b96f..9bc91e7df050 100644
--- a/examples/restaurantbot/config.yml
+++ b/examples/restaurantbot/config.yml
@@ -7,21 +7,6 @@ pipeline:
   - name: "LexicalSyntacticFeaturizer"
   - name: "DIETClassifier"
     epochs: 100
-    features: [
-      ["low", "title", "upper"],
-      [
-        "low",
-        "prefix5",
-        "prefix2",
-        "suffix5",
-        "suffix3",
-        "suffix2",
-        "upper",
-        "title",
-        "digit",
-      ],
-      ["low", "title", "upper"],
-    ]
   - name: "EntitySynonymMapper"
 
 policies:

From 1cb7c53d709c8656d1262968154b343d3b45f187 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 10:43:31 +0100
Subject: [PATCH 578/633] substitute loss and sim strings with constants

---
 rasa/core/policies/embedding_policy.py        |  6 ++++--
 rasa/core/policies/ted_policy.py              | 17 ++++++++++------
 rasa/nlu/classifiers/diet_classifier.py       |  8 +++++---
 .../embedding_intent_classifier.py            |  6 ++++--
 rasa/nlu/selectors/response_selector.py       |  6 ++++--
 rasa/utils/tensorflow/constants.py            |  6 +++++-
 rasa/utils/tensorflow/layers.py               | 16 +++++++--------
 rasa/utils/train_utils.py                     | 20 +++++++++++--------
 8 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 775db22e295a..644e49101d4f 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -34,6 +34,8 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
+    SOFTMAX,
+    AUTO,
 )
 from rasa.utils.tensorflow.models import RasaModel
 import rasa.utils.common as common_utils
@@ -97,9 +99,9 @@ class EmbeddingPolicy(TEDPolicy):
         # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-        SIMILARITY_TYPE: "auto",
+        SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'softmax' or 'margin'.
-        LOSS_TYPE: "softmax",
+        LOSS_TYPE: SOFTMAX,
         # Number of top actions to normalize scores for loss type 'softmax'.
         # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 40912404ef27..a6fda89d04e4 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -55,7 +55,8 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
-    EVALUATE_ONCE_PER_EPOCH,
+    SOFTMAX,
+    AUTO,
 )
 
 
@@ -123,9 +124,9 @@ class TEDPolicy(Policy):
         # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-        SIMILARITY_TYPE: "auto",
+        SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'softmax' or 'margin'.
-        LOSS_TYPE: "softmax",
+        LOSS_TYPE: SOFTMAX,
         # Number of top actions to normalize scores for loss type 'softmax'.
         # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
@@ -208,7 +209,10 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
     # noinspection PyPep8Naming
     @staticmethod
     def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray:
-        """Prepare Y data for training: extract label_ids."""
+        """Prepare Y data for training: extract label_ids.
+
+        label_ids are indices of labels, while `data_Y` contains one-hot encodings.
+        """
 
         return data_Y.argmax(axis=-1)
 
@@ -270,7 +274,6 @@ def _create_label_data(self, domain: Domain) -> RasaModelData:
         label_data.add_features("label_features", [all_labels])
         return label_data
 
-    # training methods
     def train(
         self,
         training_trackers: List[DialogueStateTracker],
@@ -330,6 +333,7 @@ def predict_action_probabilities(
 
         Return the list of probabilities for the next actions.
         """
+
         if self.model is None:
             return self._default_predictions(domain)
 
@@ -340,9 +344,10 @@ def predict_action_probabilities(
         output = self.model.predict(model_data)
 
         confidence = output["action_scores"].numpy()
+        # remove batch dimension and take the last prediction in the sequence
         confidence = confidence[0, -1, :]
 
-        if self.config[LOSS_TYPE] == "softmax" and self.config[RANKING_LENGTH] > 0:
+        if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0:
             confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH])
 
         return confidence.tolist()
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 5eb69682977b..bd6797e6c901 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -74,6 +74,8 @@
     KEY_RELATIVE_ATTENTION,
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
+    SOFTMAX,
+    AUTO,
 )
 
 
@@ -143,9 +145,9 @@ def required_components(cls) -> List[Type[Component]]:
         # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-        SIMILARITY_TYPE: "auto",
+        SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'softmax' or 'margin'.
-        LOSS_TYPE: "softmax",
+        LOSS_TYPE: SOFTMAX,
         # Number of top actions to normalize scores for loss type 'softmax'.
         # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
@@ -686,7 +688,7 @@ def _predict_label(
         label_ids = message_sim.argsort()[::-1]
 
         if (
-            self.component_config[LOSS_TYPE] == "softmax"
+            self.component_config[LOSS_TYPE] == SOFTMAX
             and self.component_config[RANKING_LENGTH] > 0
         ):
             message_sim = train_utils.normalize(
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index d36adf590e69..5d8592797f5b 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -36,6 +36,8 @@
     MAX_POS_SIM,
     EMBEDDING_DIMENSION,
     BILOU_FLAG,
+    SOFTMAX,
+    AUTO,
 )
 import rasa.utils.common as common_utils
 from rasa.utils.tensorflow.models import RasaModel
@@ -91,9 +93,9 @@ def required_components(cls) -> List[Type[Component]]:
         # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-        SIMILARITY_TYPE: "auto",
+        SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'softmax' or 'margin'.
-        LOSS_TYPE: "softmax",
+        LOSS_TYPE: SOFTMAX,
         # Number of top actions to normalize scores for loss type 'softmax'.
         # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index f9dd2c7d4663..891f35d412ce 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -48,6 +48,8 @@
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
     RETRIEVAL_INTENT,
+    SOFTMAX,
+    AUTO,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -129,9 +131,9 @@ def required_components(cls) -> List[Type[Component]]:
         # their similarity to the user input during training.
         NUM_NEG: 20,
         # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-        SIMILARITY_TYPE: "auto",
+        SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'softmax' or 'margin'.
-        LOSS_TYPE: "softmax",
+        LOSS_TYPE: SOFTMAX,
         # Number of top actions to normalize scores for loss type 'softmax'.
         # Set to 0 to turn off normalization.
         RANKING_LENGTH: 10,
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index a3425cbc8664..0870f4655235 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -53,4 +53,8 @@
 
 RETRIEVAL_INTENT = "retrieval_intent"
 
-EVALUATE_ONCE_PER_EPOCH = -1
+SOFTMAX = "softmax"
+MARGIN = "margin"
+AUTO = "auto"
+INNER = "inner"
+COSINE = "cosine"
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 12e3186fdd5a..f1f2a5ed9932 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -4,7 +4,7 @@
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import initializers
+from rasa.utils.tensorflow.constants import SOFTMAX, MARGIN, COSINE, INNER
 
 logger = logging.getLogger(__name__)
 
@@ -139,10 +139,10 @@ def __init__(
         super().__init__(name=f"embed_{layer_name_suffix}")
 
         self.similarity_type = similarity_type
-        if self.similarity_type and self.similarity_type not in {"cosine", "inner"}:
+        if self.similarity_type and self.similarity_type not in {COSINE, INNER}:
             raise ValueError(
                 f"Wrong similarity type '{self.similarity_type}', "
-                f"should be 'cosine' or 'inner'"
+                f"should be '{COSINE}' or '{INNER}'"
             )
 
         regularizer = tf.keras.regularizers.l2(reg_lambda)
@@ -155,7 +155,7 @@ def __init__(
 
     def call(self, x: tf.Tensor) -> tf.Tensor:
         x = self._dense(x)
-        if self.similarity_type == "cosine":
+        if self.similarity_type == COSINE:
             x = tf.nn.l2_normalize(x, -1)
 
         return x
@@ -409,7 +409,7 @@ def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tens
 
     @staticmethod
     def confidence_from_sim(sim: tf.Tensor, similarity_type: Text) -> tf.Tensor:
-        if similarity_type == "cosine":
+        if similarity_type == COSINE:
             # clip negative values to zero
             return tf.nn.relu(sim)
         else:
@@ -555,14 +555,14 @@ def _loss_softmax(
     def _chosen_loss(self) -> Callable:
         """Use loss depending on given option."""
 
-        if self.loss_type == "margin":
+        if self.loss_type == MARGIN:
             return self._loss_margin
-        elif self.loss_type == "softmax":
+        elif self.loss_type == SOFTMAX:
             return self._loss_softmax
         else:
             raise ValueError(
                 f"Wrong loss type '{self.loss_type}', "
-                f"should be 'margin' or 'softmax'"
+                f"should be '{MARGIN}' or '{SOFTMAX}'"
             )
 
     def call(
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 1dddf3543b34..9fd9bbc668fe 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -28,7 +28,11 @@
     NEGATIVE_MARGIN_SCALE,
     DROP_RATE,
     EPOCHS,
-    EVALUATE_ONCE_PER_EPOCH,
+    SOFTMAX,
+    MARGIN,
+    AUTO,
+    INNER,
+    COSINE,
 )
 
 
@@ -60,11 +64,11 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
 
     Returns: updated model configuration
     """
-    if config.get(SIMILARITY_TYPE) == "auto":
-        if config[LOSS_TYPE] == "softmax":
-            config[SIMILARITY_TYPE] = "inner"
-        elif config[LOSS_TYPE] == "margin":
-            config[SIMILARITY_TYPE] = "cosine"
+    if config.get(SIMILARITY_TYPE) == AUTO:
+        if config[LOSS_TYPE] == SOFTMAX:
+            config[SIMILARITY_TYPE] = INNER
+        elif config[LOSS_TYPE] == MARGIN:
+            config[SIMILARITY_TYPE] = COSINE
 
     return config
 
@@ -123,7 +127,7 @@ def sequence_to_sentence_features(
 
 def update_evaluation_parameters(config: Dict[Text, Any]) -> Dict[Text, Any]:
     """
-    If EVAL_NUM_EPOCHS is set to -1, evaluate at the end of every epoch.
+    If EVAL_NUM_EPOCHS is set to -1, evaluate at the end of the training.
 
     Args:
         config: model configuration
@@ -131,7 +135,7 @@ def update_evaluation_parameters(config: Dict[Text, Any]) -> Dict[Text, Any]:
     Returns: updated model configuration
     """
 
-    if config[EVAL_NUM_EPOCHS] == EVALUATE_ONCE_PER_EPOCH:
+    if config[EVAL_NUM_EPOCHS] == -1:
         config[EVAL_NUM_EPOCHS] = config[EPOCHS]
     elif config[EVAL_NUM_EPOCHS] < 1:
         raise ValueError(

From 12c208b1875ba77d37362eb539c0b78221a9f12f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 10:47:09 +0100
Subject: [PATCH 579/633] fix doc warnings

---
 docs/nlu/components.rst | 68 ++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index ad991ffc370d..477206625e44 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1017,90 +1017,90 @@ ResponseSelector
             # and labels.
             # The number of hidden layers is equal to the length of the corresponding
             # list.
-            hidden_layers_sizes: {"text": [256, 128], "label": [256, 128]},
+            hidden_layers_sizes: {"text": [256, 128], "label": [256, 128]}
             # Whether to share the hidden layer weights between input words and responses
-            "share_hidden_layers": False,
+            "share_hidden_layers": False
             # Number of units in transformer
-            "transformer_size": None,
+            "transformer_size": None
             # Number of transformer layers
-            "number_of_transformer_layers": 0,
+            "number_of_transformer_layers": 0
             # Number of attention heads in transformer
-            "number_of_attention_heads": 4,
+            "number_of_attention_heads": 4
             # If 'True' use key relative embeddings in attention
-            "use_key_relative_attention": False,
+            "use_key_relative_attention": False
             # If 'True' use key relative embeddings in attention
-            "use_value_relative_attention": False,
+            "use_value_relative_attention": False
             # Max position for relative embeddings
-            "max_relative_position": None,
+            "max_relative_position": None
             # Use a unidirectional or bidirectional encoder.
-            "unidirectional_encoder": False,
+            "unidirectional_encoder": False
             # ## Training parameters
             # Initial and final batch sizes:
             # Batch size will be linearly increased for each epoch.
-            "batch_size": [64, 256],
+            "batch_size": [64, 256]
             # Strategy used when creating batches.
             # Can be either 'sequence' or 'balanced'.
-            "batch_strategy": "balanced",
+            "batch_strategy": "balanced"
             # Number of epochs to train
-            "epochs": 300,
+            "epochs": 300
             # Set random seed to any 'int' to get reproducible results
-            "random_seed": None,
+            "random_seed": None
             # Initial learning rate for the optimizer
-            "learning_rate": 0.001,
+            "learning_rate": 0.001
             # ## Parameters for embeddings
             # Dimension size of embedding vectors
-            "embedding_dimension": 20,
+            "embedding_dimension": 20
             # Default dense dimension to use if no dense features are present.
-            "dense_dimension": {TEXT: 512, LABEL: 512},
+            "dense_dimension": {"text": 512, "label": 512}
             # The number of incorrect labels. The algorithm will minimize
             # their similarity to the user input during training.
-            "number_of_negative_examples": 20,
+            "number_of_negative_examples": 20
             # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
-            "similarity_type": "auto",
+            "similarity_type": "auto"
             # The type of the loss function, either 'softmax' or 'margin'.
-            "loss_type": "softmax",
+            "loss_type": "softmax"
             # Number of top actions to normalize scores for loss type 'softmax'.
             # Set to 0 to turn off normalization.
-            "ranking_length": 10,
+            "ranking_length": 10
             # Indicates how similar the algorithm should try to make embedding vectors
             # for correct labels.
             # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
-            "maximum_positive_similarity": 0.8,
+            "maximum_positive_similarity": 0.8
             # Maximum negative similarity for incorrect labels.
             # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
-            "maximum_negative_similarity": -0.4,
+            "maximum_negative_similarity": -0.4
             # If 'True' the algorithm only minimizes maximum similarity over
             # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
-            "use_maximum_negative_similarity": True,
+            "use_maximum_negative_similarity": True
             # Scale loss inverse proportionally to confidence of correct prediction
-            "scale_loss": True,
+            "scale_loss": True
             # ## Regularization parameters
             # The scale of regularization
-            "regularization_constant": 0.002,
+            "regularization_constant": 0.002
             # Sparsity of the weights in dense layers
-            "weight_sparsity": 0.8,
+            "weight_sparsity": 0.8
             # The scale of how important is to minimize the maximum similarity
             # between embeddings of different labels.
-            "negative_margin_scale": 0.8,
+            "negative_margin_scale": 0.8
             # Dropout rate for encoder
-            "drop_rate": 0.2,
+            "drop_rate": 0.2
             # Dropout rate for attention
-            "drop_rate_attention": 0,
+            "drop_rate_attention": 0
             # If 'True' apply dropout to sparse tensors
-            "use_sparse_input_dropout": False,
+            "use_sparse_input_dropout": False
             # ## Evaluation parameters
             # How often calculate validation accuracy.
             # Small values may hurt performance, e.g. model accuracy.
-            "evaluate_every_number_of_epochs": 20,
+            "evaluate_every_number_of_epochs": 20
             # How many examples to use for hold out validation set
             # Large values may hurt performance, e.g. model accuracy.
-            "evaluate_on_number_of_examples": 0,
+            "evaluate_on_number_of_examples": 0
             # ## Selector config
             # If 'True' random tokens of the input message will be masked and the model
             # should predict those tokens.
-            "use_masked_language_model": False,
+            "use_masked_language_model": False
             # Name of the intent for which this response selector is to be trained
-            "retrieval_intent: None,
+            "retrieval_intent: None
 
 
 Entity Extractors

From 450427168ff8ecdd3fdd7ea4a4444ea95230751f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 10:53:49 +0100
Subject: [PATCH 580/633] address rasa init problems

---
 rasa/cli/initial_project/config.yml     |  2 ++
 rasa/nlu/classifiers/diet_classifier.py | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml
index dedb0b714890..e22e49c76008 100644
--- a/rasa/cli/initial_project/config.yml
+++ b/rasa/cli/initial_project/config.yml
@@ -11,8 +11,10 @@ pipeline:
     min_ngram: 1
     max_ngram: 4
   - name: DIETClassifier
+    epochs: 100
   - name: EntitySynonymMapper
   - name: ResponseSelector
+    epochs: 100
 
 # Configuration for Rasa Core.
 # https://rasa.com/docs/rasa/core/policies/
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index bd6797e6c901..3fe1ca527492 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -622,8 +622,8 @@ def train(
 
         model_data = self.preprocess_train_data(training_data)
         if model_data.is_empty():
-            logger.error(
-                f"Can not train '{self.__class__.__name__}'. No data was provided. "
+            logger.debug(
+                f"Cannot train '{self.__class__.__name__}'. No data was provided. "
                 f"Skipping training of the classifier."
             )
             return
@@ -631,7 +631,7 @@ def train(
         if self.component_config.get(INTENT_CLASSIFICATION):
             if not self._check_enough_labels(model_data):
                 logger.error(
-                    f"Can not train '{self.__class__.__name__}'. "
+                    f"Cannot train '{self.__class__.__name__}'. "
                     f"Need at least 2 different intent classes. "
                     f"Skipping training of classifier."
                 )
@@ -659,7 +659,7 @@ def train(
     # process helpers
     def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
         if self.model is None:
-            logger.error(
+            logger.debug(
                 "There is no trained model: component is either not trained or "
                 "didn't receive enough training data."
             )
@@ -839,8 +839,8 @@ def load(
         """Loads the trained model from the provided directory."""
 
         if not model_dir or not meta.get("file"):
-            warnings.warn(
-                f"Failed to load nlu model. "
+            logger.debug(
+                f"Failed to load model. "
                 f"Maybe the path '{os.path.abspath(model_dir)}' doesn't exist?"
             )
             return cls(component_config=meta)

From 2175f2762b613d19ac84cc56619e1a7ae125981b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 11:13:53 +0100
Subject: [PATCH 581/633] documentation review comments

---
 changelog/5187.feature.rst       | 10 +++++-----
 changelog/5266.feature.rst       | 12 +++++++-----
 docs/migration-guide.rst         |  6 ++++--
 docs/nlu/choosing-a-pipeline.rst | 10 +++++-----
 docs/nlu/components.rst          |  2 ++
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst
index 0ddcb3596438..c18bc7efdfdd 100644
--- a/changelog/5187.feature.rst
+++ b/changelog/5187.feature.rst
@@ -1,7 +1,7 @@
 Integrate language models from HuggingFace's `Transformers <https://github.com/huggingface/transformers>`_ Library.
 
-Add a new NLP component :ref:`HFTransformersNLP <HFTransformersNLP>` which
-tokenizes and featurizes incoming messages using a specified pre-trained model with the Transformers library as the backend.
-Add ``LanguageModelTokenizers`` and ``LanguageModelFeaturizers`` which use the information from ``HFTransformersNLP``
-and sets them correctly for message object.
-Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa
+Add a new NLP component :ref:`HFTransformersNLP` which tokenizes and featurizes incoming messages using a specified
+pre-trained model with the Transformers library as the backend.
+Add :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` which use the information from
+:ref:`HFTransformersNLP` and sets them correctly for message object.
+Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa.
diff --git a/changelog/5266.feature.rst b/changelog/5266.feature.rst
index f9be522ddeab..e876d7d0d51d 100644
--- a/changelog/5266.feature.rst
+++ b/changelog/5266.feature.rst
@@ -1,12 +1,14 @@
-Added a new NLU component ``DIETClassifier`` and a new policy ``TEDPolicy``.
+Added a new NLU component :ref:`DIETClassifier <diet-classifier>` and a new policy :ref:`TEDPolicy <ted-policy>`.
 
 DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity
 recognition. You can read more about this component in our :ref:`documentation <diet-classifier>`.
-The new component will replace the ``EmbeddingIntentClassifier`` and the ``CRFEntityExtractor`` in the future.
+The new component will replace the :ref:`EmbeddingIntentClassifier <embedding-intent-classifier>` and the
+:ref:`CRFEntityExtractor` in the future.
 Those two components are deprecated from now on.
 See :ref:`migration guide <migration-to-rasa-1.8>` for details on how to
 switch to the new component.
 
-``TEDPolicy`` is the new name for ``EmbeddingPolicy``. ``EmbeddingPolicy`` is deprecated from now on.
-The functionality of ``TEDPolicy`` and ``EmbeddingPolicy`` is the same. Please update your configuration file
-to use the new name for the policy.
+:ref:`TEDPolicy <ted-policy>` is the new name for :ref:`EmbeddingPolicy <embedding_policy>`.
+``EmbeddingPolicy`` is deprecated from now on.
+The functionality of ``TEDPolicy`` and ``EmbeddingPolicy`` is the same.
+Please update your configuration file to use the new name for the policy.
diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index de0c8827080d..965722ab702c 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -90,7 +90,8 @@ General
       # ... any other parameters
 
   See :ref:`diet-classifier` for more information about the new component.
-  The classifier ``EmbeddingIntentClassifier`` still exists out of now and behaves the same as before.
+  Specifying ``EmbeddingIntentClassifier`` in the configuration maps to the above component definition, the
+  behaviour is unchanged from previous versions.
 
 - ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. If you want to
   get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration:
@@ -128,7 +129,8 @@ General
   We extracted the featurization from the component into the new featurizer :ref:``LexicalSyntacticFeaturizer``. Thus,
   in order to obtain the same results as before, you need to add this featurizer to your pipeline before the
   :ref:``diet-classifier``.
-  The entity extractor ``CRFEntityExtractor`` still exists out of now and behaves the same as before.
+   Specifying ``CRFEntityExtractor`` in the configuration maps to the above component definition, the behaviour
+  is unchanged from previous versions.
 
 .. _migration-to-rasa-1.7:
 
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index e4b711650171..d6a0b52c5491 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -17,7 +17,7 @@ it on your dataset.
     We deprecated all existing pipeline templates, e.g.
     :ref:`supervised_embeddings <section_supervised_embeddings_pipeline>`,
     :ref:`pretrained_embeddings_spacy <section_pretrained_embeddings_spacy_pipeline>` and
-    :ref:`pretrained_embeddings_convert <section_pretrained_embeddings_convert_pipeline>`. Please, list any
+    :ref:`pretrained_embeddings_convert <section_pretrained_embeddings_convert_pipeline>`. Please list any
     components you want to use directly in the configuration file.
 
 The Short Answer
@@ -37,7 +37,7 @@ In case your training data is in a different language than English, use the foll
 A Longer Answer
 ---------------
 
-We recommend to use the following pipeline, if your training data is in English:
+We recommend using following pipeline, if your training data is in English:
 
 .. literalinclude:: ../../data/configs_for_docs/default_english_config.yml
     :language: yaml
@@ -50,7 +50,7 @@ This is especially useful if you don’t have enough training data.
 The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence.
 However, ``ConveRT`` is only available in English.
-If your training data is not in English, we recommend to use the following pipeline:
+If your training data is not in English, we recommend using the following pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/default_config.yml
     :language: yaml
@@ -134,7 +134,7 @@ Entity Recognition / Intent Classification / Response Selectors
 Depending on your data you may want to only perform intent classification, entity recognition or response selection.
 Or you might want to combine multiple of those tasks.
 We support several components for each of the task. All of them are listed in :ref:`components`.
-We recommend to use :ref:`diet-classifier` for intent classification and entity recognition
+We recommend using :ref:`diet-classifier` for intent classification and entity recognition
 and :ref:`response-selector` for response selection.
 
 Comparing different pipelines for your data
@@ -181,7 +181,7 @@ You'll also need to define these flags in whichever tokenizer you are using:
     - ``intent_tokenization_flag``: Set it to ``True``, so that intent labels are tokenized.
     - ``intent_split_symbol``: Set it to the delimiter string that splits the intent labels. Default ``_``.
 
-Read a `tutotiral <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__
+Read a `tutorial <https://blog.rasa.com/how-to-handle-multiple-intents-per-input-using-rasa-nlu-tensorflow-pipeline/>`__
 on how to use multiple intents in Rasa.
 
 Here's an example configuration:
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 477206625e44..62e90514dfb1 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -1228,6 +1228,8 @@ EntitySynonymMapper
         pipeline:
         - name: "EntitySynonymMapper"
 
+.. _CRFEntityExtractor:
+
 CRFEntityExtractor
 ~~~~~~~~~~~~~~~~~~
 

From 8e6e586c10685808c83fac72334ca1e6626df3ab Mon Sep 17 00:00:00 2001
From: akelad <akela@rasa.com>
Date: Tue, 25 Feb 2020 11:44:55 +0100
Subject: [PATCH 582/633] fix formatting error

---
 docs/migration-guide.rst         | 13 ++++++++++---
 docs/nlu/choosing-a-pipeline.rst | 12 ++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 965722ab702c..4a9d2a77d73b 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -15,6 +15,11 @@ how you can migrate from one version to another.
 
 Rasa 1.7 to Rasa 1.8
 --------------------
+.. warning::
+
+  This is a release **breaking backwards compatibility**.
+  It is not possible to load previously trained models. Please make sure to retrain a
+  model before trying to use it with this improved version.
 
 General
 ~~~~~~~
@@ -36,8 +41,10 @@ General
   ``epochs``. ``max_history`` is particularly important and strongly depends on your stories.
   Please see the docs of the :ref:`ted_policy` if you want to customize them.
 
-- All pre-defined pipeline templates are deprecated. Take a look at :ref:`choosing-a-pipeline`
-  to decide on what components you should use in your configuration file.
+- All pre-defined pipeline templates are deprecated. **Any templates you use will be
+  mapped to the new configuration, but the underlying architecture is the same**.
+  Take a look at :ref:`choosing-a-pipeline` to decide on what components you should use
+  in your configuration file.
 
 - The :ref:`embedding_policy` was renamed to :ref:`ted_policy`. The functionality of the policy stayed the same.
   Please update your configuration files to use ``TEDPolicy`` instead of ``EmbeddingPolicy``.
@@ -129,7 +136,7 @@ General
   We extracted the featurization from the component into the new featurizer :ref:``LexicalSyntacticFeaturizer``. Thus,
   in order to obtain the same results as before, you need to add this featurizer to your pipeline before the
   :ref:``diet-classifier``.
-   Specifying ``CRFEntityExtractor`` in the configuration maps to the above component definition, the behaviour
+  Specifying ``CRFEntityExtractor`` in the configuration maps to the above component definition, the behaviour
   is unchanged from previous versions.
 
 .. _migration-to-rasa-1.7:
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index d6a0b52c5491..71ffff1ef193 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -123,7 +123,7 @@ is pre-trained in the language specific to your training data.
 For example, there are chinese (``bert-base-chinese``) and japanese (``bert-base-japanese``) variants of the BERT model.
 A full list of different variants of
 these language models is available in the
-`official documentation of the transformers library <https://huggingface.co/transformers/pretrained_models.html>_`.
+`official documentation of the transformers library <https://huggingface.co/transformers/pretrained_models.html>`_.
 
 ``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`),
 so you can use this as another alternative, depending on the language of your training data.
@@ -419,14 +419,14 @@ the default components that make up the ``supervised_embeddings`` pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_2.yml
     :language: yaml
-    
+
 So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you
 can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers <tokenizers>`,
 or you can :ref:`create your own <custom-nlu-components>`.
 
-The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one 
-featurizes text based on words. The second one featurizes text based on character 
-n-grams, preserving word boundaries. We empirically found the second featurizer 
+The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one
+featurizes text based on words. The second one featurizes text based on character
+n-grams, preserving word boundaries. We empirically found the second featurizer
 to be more powerful, but we decided to keep the first featurizer as well to make
 featurization more robust.
 
@@ -451,4 +451,4 @@ Another version of this pipeline uses MITIE's featurizer and also its multi-clas
 Training can be quite slow, so this is not recommended for large datasets.
 
 .. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml
-    :language: yaml
\ No newline at end of file
+    :language: yaml

From c22adfff596913e159fc76cd4fd93a4bc38f9639 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 12:16:41 +0100
Subject: [PATCH 583/633] update migration guide

---
 docs/migration-guide.rst | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst
index 4a9d2a77d73b..11463a6e3395 100644
--- a/docs/migration-guide.rst
+++ b/docs/migration-guide.rst
@@ -79,7 +79,8 @@ General
   Old configuration options will be mapped to the new names, and a warning will be thrown.
   However, these will be deprecated in a future release.
 
-- :ref:`embedding-intent-classifier` is now deprecated and will be replaced by :ref:`diet-classifier` in the future.
+- :ref:`embedding-intent-classifier` is now deprecated and will be replaced by :ref:`DIETClassifier <diet-classifier>`
+  in the future.
   ``DIETClassfier`` performs intent classification as well as entity recognition.
   If you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use
   the following configuration of ``DIETClassifier``:
@@ -96,7 +97,7 @@ General
       number_of_transformer_layers: 0
       # ... any other parameters
 
-  See :ref:`diet-classifier` for more information about the new component.
+  See :ref:`DIETClassifier <diet-classifier>` for more information about the new component.
   Specifying ``EmbeddingIntentClassifier`` in the configuration maps to the above component definition, the
   behaviour is unchanged from previous versions.
 
@@ -139,6 +140,35 @@ General
   Specifying ``CRFEntityExtractor`` in the configuration maps to the above component definition, the behaviour
   is unchanged from previous versions.
 
+- If your pipeline contains ``CRFEntityExtractor`` and ``EmbeddingIntentClassifier`` you can substitute both
+  components with :ref:`DIETClassifier <diet-classifier>`. You can use the following pipeline for that:
+
+  .. code-block:: yaml
+
+    pipeline:
+    # - ... other components
+    - name: LexicalSyntacticFeaturizer
+      features: [
+        ["low", "title", "upper"],
+        [
+          "BOS",
+          "EOS",
+          "low",
+          "prefix5",
+          "prefix2",
+          "suffix5",
+          "suffix3",
+          "suffix2",
+          "upper",
+          "title",
+          "digit",
+        ],
+        ["low", "title", "upper"],
+      ]
+    - name: DIETClassifier
+      number_of_transformer_layers: 0
+      # ... any other parameters
+
 .. _migration-to-rasa-1.7:
 
 Rasa 1.6 to Rasa 1.7

From 81caf9e079b4cab7738150676cb69239919eec70 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 13:05:14 +0100
Subject: [PATCH 584/633] modify comments

---
 rasa/nlu/classifiers/diet_classifier.py | 45 ++++++++++++-------------
 rasa/nlu/selectors/response_selector.py |  2 +-
 rasa/nlu/utils/bilou_utils.py           | 11 ++++--
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index bd6797e6c901..c7f54477083a 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -308,22 +308,22 @@ def _create_label_id_dict(
         }
 
     def _create_tag_id_dict(self, training_data: TrainingData) -> Dict[Text, int]:
-        """Create label_id dictionary"""
+        """Create tag_id dictionary"""
 
         if self.component_config[BILOU_FLAG]:
             return bilou_utils.build_tag_id_dict(training_data)
 
         distinct_tag_ids = set(
-            [
-                e["entity"]
-                for example in training_data.entity_examples
-                for e in example.get(ENTITIES)
-            ]
+            e["entity"]
+            for example in training_data.entity_examples
+            for e in example.get(ENTITIES)
         ) - {None}
 
         tag_id_dict = {
             tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
         }
+        # "O" corresponds to non-entity which should correspond to 0 index
+        # needed for correct prediction for padding
         tag_id_dict["O"] = 0
 
         return tag_id_dict
@@ -351,15 +351,13 @@ def _find_example_for_tag(
     def _check_labels_features_exist(
         labels_example: List[Message], attribute: Text
     ) -> bool:
-        """Check if all labels have features set"""
+        """Checks if all labels have features set."""
 
-        for label_example in labels_example:
-            if (
-                label_example.get(SPARSE_FEATURE_NAMES[attribute]) is None
-                and label_example.get(DENSE_FEATURE_NAMES[attribute]) is None
-            ):
-                return False
-        return True
+        return all(
+            label_example.get(SPARSE_FEATURE_NAMES[attribute]) is not None
+            or label_example.get(DENSE_FEATURE_NAMES[attribute]) is not None
+            for label_example in labels_example
+        )
 
     def _extract_features(
         self, message: Message, attribute: Text
@@ -380,9 +378,10 @@ def _extract_features(
                     f"don't coincide in '{message.text}' for attribute '{attribute}'."
                 )
 
-        # To speed up training take only the CLS token vector as feature if we don't
-        # use the transformer and we don't want to do entity recognition. We would
-        # not make use of the sequence anyway in this setup.  Carrying over
+        # If we don't use the transformer and we don't want to do entity recognition,
+        # to speed up training take only the sentence features as feature vector.
+        # It corresponds to the feature vector for the last token - CLS token.
+        # We would not make use of the sequence anyway in this setup. Carrying over
         # those features to the actual training process takes quite some time.
         if (
             self.component_config[NUM_TRANSFORMER_LAYERS] == 0
@@ -394,9 +393,9 @@ def _extract_features(
 
         return sparse_features, dense_features
 
-    def check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
-        """Checks if text features and label features have the same dimensionality if
-        hidden layers are shared."""
+    def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
+        """Checks if features have same dimensionality if hidden layers are shared."""
+
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
             num_text_features = model_data.feature_dimension("text_features")
             num_label_features = model_data.feature_dimension("label_features")
@@ -410,7 +409,7 @@ def check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
     def _extract_labels_precomputed_features(
         self, label_examples: List[Message], attribute: Text = INTENT
     ) -> List[np.ndarray]:
-        """Collect precomputed encodings"""
+        """Collects precomputed encodings."""
 
         sparse_features = []
         dense_features = []
@@ -431,7 +430,7 @@ def _extract_labels_precomputed_features(
     def _compute_default_label_features(
         labels_example: List[Message],
     ) -> List[np.ndarray]:
-        """Compute one-hot representation for the labels"""
+        """Computes one-hot representation for the labels."""
 
         return [
             np.array(
@@ -601,7 +600,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
 
         self.num_tags = len(self.inverted_tag_dict)
 
-        self.check_input_dimension_consistency(model_data)
+        self._check_input_dimension_consistency(model_data)
 
         return model_data
 
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 891f35d412ce..785997c118e3 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -254,7 +254,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
             training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
         )
 
-        self.check_input_dimension_consistency(model_data)
+        self._check_input_dimension_consistency(model_data)
 
         return model_data
 
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 0d1490f40eee..8a085745cba0 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -10,6 +10,7 @@
 
 def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
     """Remove the BILOU prefix from the given tag."""
+
     if tag[:2] in BILOU_PREFIXES:
         return tag[0]
     return None
@@ -17,6 +18,7 @@ def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
 
 def entity_name_from_tag(tag: Text) -> Text:
     """Remove the BILOU prefix from the given tag."""
+
     if tag[:2] in BILOU_PREFIXES:
         return tag[2:]
     return tag
@@ -24,6 +26,7 @@ def entity_name_from_tag(tag: Text) -> Text:
 
 def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
     """Maps the entity tags of the message to the ids of the provided dict."""
+
     if message.get(BILOU_ENTITIES):
         _tags = [
             tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict["O"]
@@ -37,11 +40,13 @@ def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
 
 def remove_bilou_prefixes(tags: List[Text]) -> List[Text]:
     """Remove the BILOU prefixes from the given tags."""
+
     return [entity_name_from_tag(t) for t in tags]
 
 
 def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
     """Create a mapping of unique tags to ids."""
+
     distinct_tags = set(
         [
             entity_name_from_tag(e)
@@ -56,14 +61,16 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
         for idx_1, tag in enumerate(sorted(distinct_tags))
         for idx_2, prefix in enumerate(BILOU_PREFIXES)
     }
+    # "O" corresponds to non-entity which should correspond to 0 index
+    # needed for correct prediction for padding
     tag_id_dict["O"] = 0
 
     return tag_id_dict
 
 
 def apply_bilou_schema(training_data: TrainingData):
-    """Obtains a list of BILOU entity tags and sets them on the corresponding
-    message."""
+    """Gets a list of BILOU entity tags and sets them on the corresponding message."""
+
     for message in training_data.training_examples:
         entities = message.get(ENTITIES)
 

From 0ecff893df4dbc7122335997429a45800960b7ea Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 13:28:58 +0100
Subject: [PATCH 585/633] update choosing a pipeline

---
 data/configs_for_docs/default_config.yml      |  6 ++----
 .../configs_for_docs/default_spacy_config.yml |  5 ++++-
 docs/nlu/choosing-a-pipeline.rst              | 20 +++++++++++++------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml
index b6591e42bc97..46b75c8078c7 100644
--- a/data/configs_for_docs/default_config.yml
+++ b/data/configs_for_docs/default_config.yml
@@ -1,9 +1,7 @@
 language: "en"
 
 pipeline:
-  - name: SpacyNLP
-  - name: SpacyTokenizer
-  - name: SpacyFeaturizer
+  - name: WhitespaceTokenizer
   - name: RegexFeaturizer
   - name: LexicalSyntacticFeaturizer
   - name: CountVectorsFeaturizer
@@ -13,4 +11,4 @@ pipeline:
     max_ngram: 4
   - name: DIETClassifier
   - name: EntitySynonymMapper
-  - name: ResponseSelector
+  - name: ResponseSelector
\ No newline at end of file
diff --git a/data/configs_for_docs/default_spacy_config.yml b/data/configs_for_docs/default_spacy_config.yml
index 4834403621ee..360a2eef2c92 100644
--- a/data/configs_for_docs/default_spacy_config.yml
+++ b/data/configs_for_docs/default_spacy_config.yml
@@ -1,7 +1,9 @@
 language: "en"
 
 pipeline:
-  - name: WhitespaceTokenizer
+  - name: SpacyNLP
+  - name: SpacyTokenizer
+  - name: SpacyFeaturizer
   - name: RegexFeaturizer
   - name: LexicalSyntacticFeaturizer
   - name: CountVectorsFeaturizer
@@ -12,3 +14,4 @@ pipeline:
   - name: DIETClassifier
   - name: EntitySynonymMapper
   - name: ResponseSelector
+
diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index 71ffff1ef193..fbc9d7143f76 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -50,15 +50,23 @@ This is especially useful if you don’t have enough training data.
 The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence.
 However, ``ConveRT`` is only available in English.
-If your training data is not in English, we recommend using the following pipeline:
+If your training data is not in English, but you still want to use pre-trained word embeddings, we recommend using
+the following pipeline:
 
-.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+.. literalinclude:: ../../data/configs_for_docs/default_spacy_config.yml
     :language: yaml
 
 It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`.
 ``SpacyFeaturizer`` provides pre-trained word embeddings from either GloVe or fastText in many different languages
 (see :ref:`pretrained-word-vectors`).
 
+If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
+and can train your model to be more domain specific.
+If you don't want to use pre-trained word embeddings, we recommend using the following pipeline:
+
+.. literalinclude:: ../../data/configs_for_docs/default_config.yml
+    :language: yaml
+
 .. note::
     We encourage everyone to define their own pipeline by listing the names of the components you want to use.
     You can find the details of each component in :ref:`components`.
@@ -92,10 +100,10 @@ You need to decide whether to use components that provide pre-trained word embed
 If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
 and can train your model to be more domain specific. For example, in general English, the word "balance" is closely
 related to "symmetry", but very different to the word "cash". In a banking domain, "balance" and "cash" are closely
-related and you'd like your model to capture that. If you don't
-use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language and domain.
-In those cases you should only use featurizers from the category :ref:`sparse featurizers <text-featurizers>`, such as
-``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``.
+related and you'd like your model to capture that.
+You should only use featurizers from the category :ref:`sparse featurizers <text-featurizers>`, such as
+``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``, if you don't want to use
+pre-trained word embeddings.
 
 The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
 "I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model already knows that the

From 9ee5cf821ce68b46dd1d0c5d13304fa51f545768 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 13:36:06 +0100
Subject: [PATCH 586/633] add note for old terminology.

---
 docs/nlu/choosing-a-pipeline.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index fbc9d7143f76..aaceb420d9b0 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -13,6 +13,10 @@ it on your dataset.
 .. contents::
    :local:
 
+.. note::
+    With Rasa 1.8.0 we updated some components and deprecated all existing pipeline templates.
+    However, **any of the old terminology will still behave the same way as it did before**!
+
 .. warning::
     We deprecated all existing pipeline templates, e.g.
     :ref:`supervised_embeddings <section_supervised_embeddings_pipeline>`,

From 7a865ab6f7134d8bdaf50ca1f7976bbb5d61a3e1 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 13:41:25 +0100
Subject: [PATCH 587/633] undo docker changes

---
 docker/Dockerfile_full                        |  2 +-
 ...ockerfile_pretrained_embeddings_convert_en | 83 -------------------
 .../config_pretrained_embeddings_convert.yml  | 15 ----
 ...etrained_embeddings_spacy_en_duckling.yml} |  4 +-
 4 files changed, 4 insertions(+), 100 deletions(-)
 delete mode 100644 docker/Dockerfile_pretrained_embeddings_convert_en
 delete mode 100644 docker/configs/config_pretrained_embeddings_convert.yml
 rename docker/configs/{config_supervised_embeddings_duckling.yml => config_pretrained_embeddings_spacy_en_duckling.yml} (83%)

diff --git a/docker/Dockerfile_full b/docker/Dockerfile_full
index c697018d783f..24d2147dc9d8 100644
--- a/docker/Dockerfile_full
+++ b/docker/Dockerfile_full
@@ -72,7 +72,7 @@ FROM base AS runner
 WORKDIR /app
 
 # Copy over default pipeline config
-COPY docker/configs/config_supervised_embeddings_duckling.yml config.yml
+COPY docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml config.yml
 
 # Copy over mitie model
 COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat
diff --git a/docker/Dockerfile_pretrained_embeddings_convert_en b/docker/Dockerfile_pretrained_embeddings_convert_en
deleted file mode 100644
index 6d58722df569..000000000000
--- a/docker/Dockerfile_pretrained_embeddings_convert_en
+++ /dev/null
@@ -1,83 +0,0 @@
-# Create common base stage
-FROM python:3.6-slim as base
-
-WORKDIR /build
-
-# Create virtualenv to isolate builds
-RUN python -m venv /build
-
-# Install common libraries
-RUN apt-get update -qq \
- && apt-get install -y --no-install-recommends \
-    # required by psycopg2 at build and runtime
-    libpq-dev \
-     # required for health check
-    curl \
- && apt-get autoremove -y
-
-# Make sure we use the virtualenv
-ENV PATH="/build/bin:$PATH"
-
-# Stage to build and install everything
-FROM base as builder
-
-WORKDIR /src
-
-# Install all required build libraries
-RUN apt-get update -qq \
- && apt-get install -y --no-install-recommends \
-    build-essential \
-    wget \
-    openssh-client \
-    graphviz-dev \
-    pkg-config \
-    git-core \
-    openssl \
-    libssl-dev \
-    libffi6 \
-    libffi-dev \
-    libpng-dev
-
-# Make sure we have the latest pip version
-RUN pip install -U pip
-
-# Copy only what we really need
-COPY README.md .
-COPY setup.py .
-COPY setup.cfg .
-COPY MANIFEST.in .
-COPY alt_requirements/ ./alt_requirements
-COPY requirements.txt .
-COPY LICENSE.txt .
-
-# Install dependencies
-RUN pip install --no-cache-dir -r alt_requirements/requirements_pretrained_embeddings_convert.txt
-
-# Install Rasa as package
-COPY rasa ./rasa
-RUN pip install .[sql,convert]
-
-# Runtime stage which uses the virtualenv which we built in the previous stage
-FROM base AS runner
-
-WORKDIR /app
-
-# Copy over default pipeline config
-COPY docker/configs/config_pretrained_embeddings_convert.yml config.yml
-
-# Copy virtualenv from previous stage
-COPY --from=builder /build /build
-
-# Create a volume for temporary data
-VOLUME /tmp
-
-# Make sure the default group has the same permissions as the owner
-RUN chgrp -R 0 . && chmod -R g=u .
-
-# Don't run as root
-USER 1001
-
-EXPOSE 5005
-
-ENTRYPOINT ["rasa"]
-CMD ["--help"]
diff --git a/docker/configs/config_pretrained_embeddings_convert.yml b/docker/configs/config_pretrained_embeddings_convert.yml
deleted file mode 100644
index ee0da9bfab1d..000000000000
--- a/docker/configs/config_pretrained_embeddings_convert.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-language: "en"
-
-pipeline:
-  - name: ConveRTTokenizer
-  - name: ConveRTFeaturizer
-  - name: RegexFeaturizer
-  - name: LexicalSyntacticFeaturizer
-  - name: CountVectorsFeaturizer
-  - name: CountVectorsFeaturizer
-    analyzer: "char_wb"
-    min_ngram: 1
-    max_ngram: 4
-  - name: DIETClassifier
-  - name: EntitySynonymMapper
-  - name: ResponseSelector
diff --git a/docker/configs/config_supervised_embeddings_duckling.yml b/docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml
similarity index 83%
rename from docker/configs/config_supervised_embeddings_duckling.yml
rename to docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml
index 7dbecba7acb9..cd314e4485be 100644
--- a/docker/configs/config_supervised_embeddings_duckling.yml
+++ b/docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml
@@ -1,7 +1,9 @@
 language: "en"
 
 pipeline:
-  - name: WhitespaceTokenizer
+  - name: SpacyNLP
+  - name: SpacyTokenizer
+  - name: SpacyFeaturizer
   - name: RegexFeaturizer
   - name: LexicalSyntacticFeaturizer
   - name: CountVectorsFeaturizer

From 9cc389defee06d1e12907a3acd891f88b15ad97c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 13:54:22 +0100
Subject: [PATCH 588/633] refactor data helpers

---
 rasa/core/policies/ted_policy.py        |  9 +++---
 rasa/nlu/classifiers/diet_classifier.py | 40 ++++++++++---------------
 rasa/nlu/selectors/response_selector.py |  7 +++--
 rasa/utils/tensorflow/model_data.py     |  8 ++++-
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index a6fda89d04e4..e80e0b75df47 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -305,10 +305,7 @@ def train(
             return
 
         # keep one example for persisting and loading
-        self.data_example = {
-            feature_name: [feature[:1] for feature in features]
-            for feature_name, features in model_data.items()
-        }
+        self.data_example = model_data.first_data_example()
 
         self.model = TED(
             model_data.get_signature(),
@@ -463,7 +460,9 @@ def __init__(
         self._check_data()
 
         self.predict_data_signature = {
-            k: vs for k, vs in data_signature.items() if "dialogue" in k
+            feature_name: features
+            for feature_name, features in data_signature.items()
+            if DIALOGUE in feature_name
         }
 
         # optimizer
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index fe118efc3be8..d399919f53c7 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -432,14 +432,9 @@ def _compute_default_label_features(
     ) -> List[np.ndarray]:
         """Computes one-hot representation for the labels."""
 
-        return [
-            np.array(
-                [
-                    np.expand_dims(a, 0)
-                    for a in np.eye(len(labels_example), dtype=np.float32)
-                ]
-            )
-        ]
+        eye_matrix = np.eye(len(labels_example), dtype=np.float32)
+        # add sequence dimension to one-hot labels
+        return [np.array([np.expand_dims(a, 0) for a in eye_matrix])]
 
     def _create_label_data(
         self,
@@ -488,14 +483,8 @@ def _create_label_data(
         return label_data
 
     def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]:
-        return [
-            np.array(
-                [
-                    self._label_data.get("label_features")[0][label_id]
-                    for label_id in label_ids
-                ]
-            )
-        ]
+        all_label_features = self._label_data.get("label_features")[0]
+        return [np.array([all_label_features[label_id] for label_id in label_ids])]
 
     def _create_model_data(
         self,
@@ -504,7 +493,7 @@ def _create_model_data(
         tag_id_dict: Optional[Dict[Text, int]] = None,
         label_attribute: Optional[Text] = None,
     ) -> RasaModelData:
-        """Prepare data for training and create a SessionDataType object"""
+        """Prepare data for training and create a RasaModelData object"""
 
         X_sparse = []
         X_dense = []
@@ -552,7 +541,7 @@ def _create_model_data(
         model_data = RasaModelData(label_key=self.label_key)
         model_data.add_features("text_features", [X_sparse, X_dense])
         model_data.add_features("label_features", [Y_sparse, Y_dense])
-        if label_attribute and model_data.does_feature_exist("label_features"):
+        if label_attribute and model_data.feature_not_exist("label_features"):
             # no label features are present, get default features from _label_data
             model_data.add_features(
                 "label_features", self._use_default_label_features(label_ids)
@@ -574,6 +563,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
 
         Performs sanity checks on training data, extracts encodings for labels.
         """
+
         if self.component_config[BILOU_FLAG]:
             bilou_utils.apply_bilou_schema(training_data)
 
@@ -637,7 +627,7 @@ def train(
                 return
 
         # keep one example for persisting and loading
-        self.data_example = {k: [v[:1] for v in vs] for k, vs in model_data.items()}
+        self.data_example = self.data_example = model_data.first_data_example()
 
         self.model = self.model_class()(
             data_signature=model_data.get_signature(),
@@ -736,11 +726,11 @@ def _predict_entities(
             tags = bilou_utils.remove_bilou_prefixes(tags)
 
         entities = self._convert_tags_to_entities(
-            message.text, message.get("tokens", []), tags
+            message.text, message.get(TOKENS_NAMES[TEXT], []), tags
         )
 
         extracted = self.add_extractor_name(entities)
-        entities = message.get("entities", []) + extracted
+        entities = message.get(ENTITIES, []) + extracted
 
         return entities
 
@@ -784,13 +774,13 @@ def process(self, message: Message, **kwargs: Any) -> None:
         if self.component_config[INTENT_CLASSIFICATION]:
             label, label_ranking = self._predict_label(out)
 
-            message.set("intent", label, add_to_output=True)
+            message.set(INTENT, label, add_to_output=True)
             message.set("intent_ranking", label_ranking, add_to_output=True)
 
         if self.component_config[ENTITY_RECOGNITION]:
             entities = self._predict_entities(out, message)
 
-            message.set("entities", entities, add_to_output=True)
+            message.set(ENTITIES, entities, add_to_output=True)
 
     def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
         """Persist this model into the passed directory.
@@ -956,7 +946,9 @@ def __init__(
         self._check_data()
 
         self.predict_data_signature = {
-            k: vs for k, vs in data_signature.items() if "text" in k
+            feature_name: features
+            for feature_name, features in data_signature.items()
+            if TEXT in feature_name
         }
 
         label_batch = label_data.prepare_batch()
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 785997c118e3..4e6169586b0f 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -238,8 +238,11 @@ def _set_message_property(
         )
 
     def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
-        """Performs sanity checks on training data, extracts encodings for labels
-        and prepares data for training"""
+        """Prepares data for training.
+
+        Performs sanity checks on training data, extracts encodings for labels.
+        """
+
         if self.retrieval_intent:
             training_data = training_data.filter_by_intent(self.retrieval_intent)
 
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 2d55489a2c3c..4bac304b2b16 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -42,7 +42,13 @@ def values(self):
     def keys(self):
         return self.data.keys()
 
-    def does_feature_exist(self, key: Text) -> bool:
+    def first_data_example(self) -> Data:
+        return {
+            feature_name: [feature[:1] for feature in features]
+            for feature_name, features in self.data.items()
+        }
+
+    def feature_not_exist(self, key: Text) -> bool:
         """Check if feature key is present and features are available."""
         return key not in self.data or not self.data[key]
 

From 8cfa6e5c98c600f985ae181cdaed394be1e5fb97 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 14:10:10 +0100
Subject: [PATCH 589/633] substitute feature name strings with constants

---
 rasa/core/policies/ted_policy.py        | 50 ++++++++--------
 rasa/nlu/classifiers/diet_classifier.py | 80 ++++++++++++++-----------
 2 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e80e0b75df47..ed440af09a7d 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -62,6 +62,9 @@
 
 logger = logging.getLogger(__name__)
 
+DIALOGUE_FEATURES = "dialogue_features"
+LABEL_FEATURES = "label_features"
+LABEL_IDS = "label_ids"
 
 SAVE_MODEL_FILE_NAME = "ted_policy"
 
@@ -219,27 +222,22 @@ def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray:
     # noinspection PyPep8Naming
     def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray:
         """Prepare Y data for training: features for label_ids."""
+
+        all_label_features = self._label_data.get(LABEL_FEATURES)[0]
+
         is_full_dialogue_featurizer_used = len(label_ids.shape) == 2
         if is_full_dialogue_featurizer_used:
             return np.stack(
                 [
                     np.stack(
-                        [
-                            self._label_data.get("label_features")[0][label_idx]
-                            for label_idx in seq_label_ids
-                        ]
+                        [all_label_features[label_idx] for label_idx in seq_label_ids]
                     )
                     for seq_label_ids in label_ids
                 ]
             )
 
         # max history featurizer is used
-        return np.stack(
-            [
-                self._label_data.get("label_features")[0][label_idx]
-                for label_idx in label_ids
-            ]
-        )
+        return np.stack([all_label_features[label_idx] for label_idx in label_ids])
 
     # noinspection PyPep8Naming
     def _create_model_data(
@@ -257,10 +255,10 @@ def _create_model_data(
             # to track correctly dynamic sequences
             label_ids = np.expand_dims(label_ids, -1)
 
-        model_data = RasaModelData(label_key="label_ids")
-        model_data.add_features("dialogue_features", [data_X])
-        model_data.add_features("label_features", [Y])
-        model_data.add_features("label_ids", [label_ids])
+        model_data = RasaModelData(label_key=LABEL_IDS)
+        model_data.add_features(DIALOGUE_FEATURES, [data_X])
+        model_data.add_features(LABEL_FEATURES, [Y])
+        model_data.add_features(LABEL_IDS, [label_ids])
 
         return model_data
 
@@ -271,7 +269,7 @@ def _create_label_data(self, domain: Domain) -> RasaModelData:
         all_labels = all_labels.astype(np.float32)
 
         label_data = RasaModelData()
-        label_data.add_features("label_features", [all_labels])
+        label_data.add_features(LABEL_FEATURES, [all_labels])
         return label_data
 
     def train(
@@ -414,7 +412,7 @@ def load(cls, path: Text) -> "TEDPolicy":
             model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
         )
 
-        model_data_example = RasaModelData(label_key="label_ids", data=loaded_data)
+        model_data_example = RasaModelData(label_key=LABEL_IDS, data=loaded_data)
         meta = train_utils.update_similarity_type(meta)
 
         model = TED.load(
@@ -430,8 +428,12 @@ def load(cls, path: Text) -> "TEDPolicy":
 
         # build the graph for prediction
         predict_data_example = RasaModelData(
-            label_key="label_ids",
-            data={k: vs for k, vs in model_data_example.items() if "dialogue" in k},
+            label_key=LABEL_IDS,
+            data={
+                feature_name: features
+                for feature_name, features in model_data_example.items()
+                if DIALOGUE in feature_name
+            },
         )
         model.build_for_predict(predict_data_example)
 
@@ -485,12 +487,12 @@ def __init__(
         self._prepare_layers()
 
     def _check_data(self) -> None:
-        if "dialogue_features" not in self.data_signature:
+        if DIALOGUE_FEATURES not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
-        if "label_features" not in self.data_signature:
+        if LABEL_FEATURES not in self.data_signature:
             raise ValueError(
                 f"No label features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
@@ -551,7 +553,7 @@ def _prepare_layers(self) -> None:
         )
 
     def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_labels = self.tf_label_data["label_features"][0]
+        all_labels = self.tf_label_data[LABEL_FEATURES][0]
         all_labels_embed = self._embed_label(all_labels)
 
         return all_labels, all_labels_embed
@@ -587,8 +589,8 @@ def batch_loss(
     ) -> tf.Tensor:
         batch = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        dialogue_in = batch["dialogue_features"][0]
-        label_in = batch["label_features"][0]
+        dialogue_in = batch[DIALOGUE_FEATURES][0]
+        label_in = batch[LABEL_FEATURES][0]
 
         if self.max_history_tracker_featurizer_used:
             # add time dimension if max history featurizer is used
@@ -613,7 +615,7 @@ def batch_predict(
     ) -> Dict[Text, tf.Tensor]:
         batch = self.batch_to_model_data_format(batch_in, self.predict_data_signature)
 
-        dialogue_in = batch["dialogue_features"][0]
+        dialogue_in = batch[DIALOGUE_FEATURES][0]
 
         if self.all_labels_embed is None:
             _, self.all_labels_embed = self._create_all_labels_embed()
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index d399919f53c7..f372a1bdb705 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -4,7 +4,6 @@
 import numpy as np
 import os
 import scipy.sparse
-import warnings
 import tensorflow as tf
 import tensorflow_addons as tfa
 
@@ -81,6 +80,13 @@
 
 logger = logging.getLogger(__name__)
 
+TEXT_FEATURES = "text_features"
+LABEL_FEATURES = "label_features"
+TEXT_MASK = "text_mask"
+LABEL_MASK = "label_mask"
+LABEL_IDS = "label_ids"
+TAG_IDS = "tag_ids"
+
 
 class DIETClassifier(IntentClassifier, EntityExtractor):
     """DIET (Dual Intent and Entity Transformer) is a multi-task architecture for
@@ -287,7 +293,7 @@ def __init__(
 
     @property
     def label_key(self) -> Optional[Text]:
-        return "label_ids" if self.component_config[INTENT_CLASSIFICATION] else None
+        return LABEL_IDS if self.component_config[INTENT_CLASSIFICATION] else None
 
     @staticmethod
     def model_class() -> Type[RasaModel]:
@@ -397,8 +403,8 @@ def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
         """Checks if features have same dimensionality if hidden layers are shared."""
 
         if self.component_config.get(SHARE_HIDDEN_LAYERS):
-            num_text_features = model_data.feature_dimension("text_features")
-            num_label_features = model_data.feature_dimension("label_features")
+            num_text_features = model_data.feature_dimension(TEXT_FEATURES)
+            num_label_features = model_data.feature_dimension(LABEL_FEATURES)
 
             if num_text_features != num_label_features:
                 raise ValueError(
@@ -471,19 +477,19 @@ def _create_label_data(
             features = self._compute_default_label_features(labels_example)
 
         label_data = RasaModelData()
-        label_data.add_features("label_features", features)
+        label_data.add_features(LABEL_FEATURES, features)
 
         label_ids = np.array([idx for (idx, _) in labels_idx_example])
         # explicitly add last dimension to label_ids
         # to track correctly dynamic sequences
-        label_data.add_features("label_ids", [np.expand_dims(label_ids, -1)])
+        label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])
 
-        label_data.add_mask("label_mask", "label_features")
+        label_data.add_mask(LABEL_MASK, LABEL_FEATURES)
 
         return label_data
 
     def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]:
-        all_label_features = self._label_data.get("label_features")[0]
+        all_label_features = self._label_data.get(LABEL_FEATURES)[0]
         return [np.array([all_label_features[label_id] for label_id in label_ids])]
 
     def _create_model_data(
@@ -539,21 +545,21 @@ def _create_model_data(
         tag_ids = np.array(tag_ids)
 
         model_data = RasaModelData(label_key=self.label_key)
-        model_data.add_features("text_features", [X_sparse, X_dense])
-        model_data.add_features("label_features", [Y_sparse, Y_dense])
-        if label_attribute and model_data.feature_not_exist("label_features"):
+        model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense])
+        model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense])
+        if label_attribute and model_data.feature_not_exist(LABEL_FEATURES):
             # no label features are present, get default features from _label_data
             model_data.add_features(
-                "label_features", self._use_default_label_features(label_ids)
+                LABEL_FEATURES, self._use_default_label_features(label_ids)
             )
 
         # explicitly add last dimension to label_ids
         # to track correctly dynamic sequences
-        model_data.add_features("label_ids", [np.expand_dims(label_ids, -1)])
-        model_data.add_features("tag_ids", [tag_ids])
+        model_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])
+        model_data.add_features(TAG_IDS, [tag_ids])
 
-        model_data.add_mask("text_mask", "text_features")
-        model_data.add_mask("label_mask", "label_features")
+        model_data.add_mask(TEXT_MASK, TEXT_FEATURES)
+        model_data.add_mask(LABEL_MASK, LABEL_FEATURES)
 
         return model_data
 
@@ -596,7 +602,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
 
     @staticmethod
     def _check_enough_labels(model_data: RasaModelData) -> bool:
-        return len(np.unique(model_data.get("label_ids"))) >= 2
+        return len(np.unique(model_data.get(LABEL_IDS))) >= 2
 
     def train(
         self,
@@ -903,7 +909,7 @@ def _load_model(
         file_name = meta.get("file")
         tf_model_file = os.path.join(model_dir, file_name + ".tf_model")
 
-        label_key = "label_ids" if meta[INTENT_CLASSIFICATION] else None
+        label_key = LABEL_IDS if meta[INTENT_CLASSIFICATION] else None
         model_data_example = RasaModelData(label_key=label_key, data=data_example)
 
         model = cls.model_class().load(
@@ -918,7 +924,11 @@ def _load_model(
         # build the graph for prediction
         predict_data_example = RasaModelData(
             label_key=label_key,
-            data={k: vs for k, vs in model_data_example.items() if "text" in k},
+            data={
+                feature_name: features
+                for feature_name, features in model_data_example.items()
+                if TEXT in feature_name
+            },
         )
 
         model.build_for_predict(predict_data_example)
@@ -969,28 +979,28 @@ def __init__(
         self.all_labels_embed = None  # needed for efficient prediction
 
     def _check_data(self) -> None:
-        if "text_features" not in self.data_signature:
+        if TEXT_FEATURES not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
         if self.config[INTENT_CLASSIFICATION]:
-            if "label_features" not in self.data_signature:
+            if LABEL_FEATURES not in self.data_signature:
                 raise ValueError(
                     f"No label features specified. "
                     f"Cannot train '{self.__class__.__name__}' model."
                 )
             if (
                 self.config[SHARE_HIDDEN_LAYERS]
-                and self.data_signature["text_features"]
-                != self.data_signature["label_features"]
+                and self.data_signature[TEXT_FEATURES]
+                != self.data_signature[LABEL_FEATURES]
             ):
                 raise ValueError(
                     "If hidden layer weights are shared, data signatures "
                     "for text_features and label_features must coincide."
                 )
 
-        if self.config[ENTITY_RECOGNITION] and "tag_ids" not in self.data_signature:
+        if self.config[ENTITY_RECOGNITION] and TAG_IDS not in self.data_signature:
             raise ValueError(
                 f"No tag ids present. "
                 f"Cannot train '{self.__class__.__name__}' model."
@@ -1246,10 +1256,10 @@ def _create_sequence(
         return transformed, x, x_seq_ids, lm_mask_bool
 
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label_ids = self.tf_label_data["label_ids"][0]
+        all_label_ids = self.tf_label_data[LABEL_IDS][0]
         x = self._create_bow(
-            self.tf_label_data["label_features"],
-            self.tf_label_data["label_mask"][0],
+            self.tf_label_data[LABEL_FEATURES],
+            self.tf_label_data[LABEL_MASK][0],
             self.label_name,
         )
         all_labels_embed = self._tf_layers["embed.label"](x)
@@ -1335,7 +1345,7 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        mask_text = tf_batch_data["text_mask"][0]
+        mask_text = tf_batch_data[TEXT_MASK][0]
         sequence_lengths = self._get_sequence_lengths(mask_text)
 
         (
@@ -1344,7 +1354,7 @@ def batch_loss(
             text_seq_ids,
             lm_mask_bool_text,
         ) = self._create_sequence(
-            tf_batch_data["text_features"],
+            tf_batch_data[TEXT_FEATURES],
             mask_text,
             self.text_name,
             self.config[MASKED_LM],
@@ -1365,10 +1375,10 @@ def batch_loss(
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
 
-            label_ids = tf_batch_data["label_ids"][0]
+            label_ids = tf_batch_data[LABEL_IDS][0]
             label = self._create_bow(
-                tf_batch_data["label_features"],
-                tf_batch_data["label_mask"][0],
+                tf_batch_data[LABEL_FEATURES],
+                tf_batch_data[LABEL_MASK][0],
                 self.label_name,
             )
             loss, acc = self._label_loss(cls, label, label_ids)
@@ -1377,7 +1387,7 @@ def batch_loss(
             losses.append(loss)
 
         if self.config[ENTITY_RECOGNITION]:
-            tag_ids = tf_batch_data["tag_ids"][0]
+            tag_ids = tf_batch_data[TAG_IDS][0]
 
             loss, f1 = self._entity_loss(
                 text_transformed, tag_ids, mask_text, sequence_lengths
@@ -1395,11 +1405,11 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )
 
-        mask_text = tf_batch_data["text_mask"][0]
+        mask_text = tf_batch_data[TEXT_MASK][0]
         sequence_lengths = self._get_sequence_lengths(mask_text)
 
         text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, self.text_name
+            tf_batch_data[TEXT_FEATURES], mask_text, self.text_name
         )
 
         out = {}

From 292134a50658ff9d33710312d445b35d5c6c0d8a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 14:53:50 +0100
Subject: [PATCH 590/633] refactor layers preparation

---
 rasa/core/policies/embedding_policy.py        |   3 +-
 rasa/core/policies/ted_policy.py              |  33 ++---
 rasa/nlu/classifiers/diet_classifier.py       | 119 ++++++++----------
 .../embedding_intent_classifier.py            |   3 +-
 rasa/nlu/selectors/response_selector.py       |  51 ++++----
 rasa/utils/tensorflow/constants.py            |   3 +
 rasa/utils/tensorflow/model_data.py           |   8 +-
 rasa/utils/tensorflow/models.py               |   5 +-
 8 files changed, 113 insertions(+), 112 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index 644e49101d4f..c55901af6d56 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -36,6 +36,7 @@
     MAX_RELATIVE_POSITION,
     SOFTMAX,
     AUTO,
+    BALANCED,
 )
 from rasa.utils.tensorflow.models import RasaModel
 import rasa.utils.common as common_utils
@@ -87,7 +88,7 @@ class EmbeddingPolicy(TEDPolicy):
         BATCH_SIZES: [8, 32],
         # Strategy used when creating batches.
         # Can be either 'sequence' or 'balanced'.
-        BATCH_STRATEGY: "balanced",
+        BATCH_STRATEGY: BALANCED,
         # Number of epochs to train
         EPOCHS: 1,
         # Set random seed to any 'int' to get reproducible results
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index ed440af09a7d..4be295083ce3 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -57,14 +57,15 @@
     MAX_RELATIVE_POSITION,
     SOFTMAX,
     AUTO,
+    BALANCED,
 )
 
 
 logger = logging.getLogger(__name__)
 
-DIALOGUE_FEATURES = "dialogue_features"
-LABEL_FEATURES = "label_features"
-LABEL_IDS = "label_ids"
+DIALOGUE_FEATURES = f"{DIALOGUE}_features"
+LABEL_FEATURES = f"{LABEL}_features"
+LABEL_IDS = f"{LABEL}_ids"
 
 SAVE_MODEL_FILE_NAME = "ted_policy"
 
@@ -115,7 +116,7 @@ class TEDPolicy(Policy):
         BATCH_SIZES: [8, 32],
         # Strategy used whenc creating batches.
         # Can be either 'sequence' or 'balanced'.
-        BATCH_STRATEGY: "balanced",
+        BATCH_STRATEGY: BALANCED,
         # Number of epochs to train
         EPOCHS: 1,
         # Set random seed to any 'int' to get reproducible results
@@ -499,7 +500,7 @@ def _check_data(self) -> None:
             )
 
     def _prepare_layers(self) -> None:
-        self._tf_layers["loss.label"] = layers.DotProductLoss(
+        self._tf_layers[f"loss.{LABEL}"] = layers.DotProductLoss(
             self.config[NUM_NEG],
             self.config[LOSS_TYPE],
             self.config[MAX_POS_SIM],
@@ -510,14 +511,14 @@ def _prepare_layers(self) -> None:
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
         )
-        self._tf_layers["ffnn.dialogue"] = layers.Ffnn(
+        self._tf_layers[f"ffnn.{DIALOGUE}"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][DIALOGUE],
             self.config[DROP_RATE_DIALOGUE],
             self.config[REGULARIZATION_CONSTANT],
             self.config[WEIGHT_SPARSITY],
             layer_name_suffix=DIALOGUE,
         )
-        self._tf_layers["ffnn.label"] = layers.Ffnn(
+        self._tf_layers[f"ffnn.{LABEL}"] = layers.Ffnn(
             self.config[HIDDEN_LAYERS_SIZES][LABEL],
             self.config[DROP_RATE_LABEL],
             self.config[REGULARIZATION_CONSTANT],
@@ -539,13 +540,13 @@ def _prepare_layers(self) -> None:
             max_relative_position=self.config[MAX_RELATIVE_POSITION],
             name=DIALOGUE + "_encoder",
         )
-        self._tf_layers["embed.dialogue"] = layers.Embed(
+        self._tf_layers[f"embed.{DIALOGUE}"] = layers.Embed(
             self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             DIALOGUE,
             self.config[SIMILARITY_TYPE],
         )
-        self._tf_layers["embed.label"] = layers.Embed(
+        self._tf_layers[f"embed.{LABEL}"] = layers.Embed(
             self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             LABEL,
@@ -565,7 +566,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
         # if there is at least one `-1` it should be masked
         mask = tf.sign(tf.reduce_max(dialogue_in, -1) + 1)
 
-        dialogue = self._tf_layers["ffnn.dialogue"](dialogue_in, self._training)
+        dialogue = self._tf_layers[f"ffnn.{DIALOGUE}"](dialogue_in, self._training)
         dialogue_transformed = self._tf_layers["transformer"](
             dialogue, 1 - tf.expand_dims(mask, axis=-1), self._training
         )
@@ -576,13 +577,13 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
             dialogue_transformed = dialogue_transformed[:, -1:, :]
             mask = mask[:, -1:]
 
-        dialogue_embed = self._tf_layers["embed.dialogue"](dialogue_transformed)
+        dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
         return dialogue_embed, mask
 
     def _embed_label(self, label_in: Union[tf.Tensor, np.ndarray]) -> tf.Tensor:
-        label = self._tf_layers["ffnn.label"](label_in, self._training)
-        return self._tf_layers["embed.label"](label)
+        label = self._tf_layers[f"ffnn.{LABEL}"](label_in, self._training)
+        return self._tf_layers[f"embed.{LABEL}"](label)
 
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
@@ -601,7 +602,7 @@ def batch_loss(
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
         label_embed = self._embed_label(label_in)
 
-        loss, acc = self._tf_layers["loss.label"](
+        loss, acc = self._tf_layers[f"loss.{LABEL}"](
             dialogue_embed, label_embed, label_in, all_labels_embed, all_labels, mask
         )
 
@@ -622,13 +623,13 @@ def batch_predict(
 
         dialogue_embed, mask = self._emebed_dialogue(dialogue_in)
 
-        sim_all = self._tf_layers["loss.label"].sim(
+        sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
             dialogue_embed[:, :, tf.newaxis, :],
             self.all_labels_embed[tf.newaxis, tf.newaxis, :, :],
             mask,
         )
 
-        scores = self._tf_layers["loss.label"].confidence_from_sim(
+        scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim(
             sim_all, self.config[SIMILARITY_TYPE]
         )
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index f372a1bdb705..b77e7d56b56b 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -75,16 +75,17 @@
     MAX_RELATIVE_POSITION,
     SOFTMAX,
     AUTO,
+    BALANCED,
 )
 
 
 logger = logging.getLogger(__name__)
 
-TEXT_FEATURES = "text_features"
-LABEL_FEATURES = "label_features"
-TEXT_MASK = "text_mask"
-LABEL_MASK = "label_mask"
-LABEL_IDS = "label_ids"
+TEXT_FEATURES = f"{TEXT}_features"
+LABEL_FEATURES = f"{LABEL}_features"
+TEXT_MASK = f"{TEXT}_mask"
+LABEL_MASK = f"{LABEL}_mask"
+LABEL_IDS = f"{LABEL}_ids"
 TAG_IDS = "tag_ids"
 
 
@@ -135,7 +136,7 @@ def required_components(cls) -> List[Type[Component]]:
         BATCH_SIZES: [64, 256],
         # Strategy used when creating batches.
         # Can be either 'sequence' or 'balanced'.
-        BATCH_STRATEGY: "balanced",
+        BATCH_STRATEGY: BALANCED,
         # Number of epochs to train
         EPOCHS: 300,
         # Set random seed to any 'int' to get reproducible results
@@ -1007,12 +1008,12 @@ def _check_data(self) -> None:
             )
 
     def _create_metrics(self) -> None:
-        # self.metrics preserve order
-        # output losses first
+        # self.metrics will have the same order as they are created
+        # so create loss metrics first to output losses first
         self.mask_loss = tf.keras.metrics.Mean(name="m_loss")
         self.intent_loss = tf.keras.metrics.Mean(name="i_loss")
         self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
-        # output accuracies second
+        # create accuracy metrics second to output accuracies second
         self.mask_acc = tf.keras.metrics.Mean(name="m_acc")
         self.response_acc = tf.keras.metrics.Mean(name="i_acc")
         self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
@@ -1083,11 +1084,32 @@ def _prepare_input_layers(self, name: Text) -> None:
             name,
         )
 
+    def _prepare_embed_layers(self, name: Text) -> None:
+        self._tf_layers[f"embed.{name}"] = layers.Embed(
+            self.config[EMBEDDING_DIMENSION],
+            self.config[REGULARIZATION_CONSTANT],
+            name,
+            self.config[SIMILARITY_TYPE],
+        )
+
+    def _prepare_dot_product_loss(self, name: Text) -> None:
+        self._tf_layers[f"loss.{name}"] = layers.DotProductLoss(
+            self.config[NUM_NEG],
+            self.config[LOSS_TYPE],
+            self.config[MAX_POS_SIM],
+            self.config[MAX_NEG_SIM],
+            self.config[USE_MAX_NEG_SIM],
+            self.config[NEGATIVE_MARGIN_SCALE],
+            self.config[SCALE_LOSS],
+            # set to 1 to get deterministic behaviour
+            parallel_iterations=1 if self.random_seed is not None else 1000,
+        )
+
     def _prepare_sequence_layers(self, name: Text) -> None:
         self._prepare_input_layers(name)
 
-        self._tf_layers[f"{name}_transformer"] = (
-            TransformerEncoder(
+        if self.config[NUM_TRANSFORMER_LAYERS] > 0:
+            self._tf_layers[f"{name}_transformer"] = TransformerEncoder(
                 self.config[NUM_TRANSFORMER_LAYERS],
                 self.config[TRANSFORMER_SIZE],
                 self.config[NUM_HEADS],
@@ -1102,60 +1124,23 @@ def _prepare_sequence_layers(self, name: Text) -> None:
                 max_relative_position=self.config[MAX_RELATIVE_POSITION],
                 name=f"{name}_encoder",
             )
-            if self.config[NUM_TRANSFORMER_LAYERS] > 0
-            else lambda x, mask, training: x
-        )
+        else:
+            # create lambda so that it can be used later without the check
+            self._tf_layers[f"{name}_transformer"] = lambda x, mask, training: x
 
     def _prepare_mask_lm_layers(self, name: Text) -> None:
         self._tf_layers[f"{name}_input_mask"] = layers.InputMask()
-        self._tf_layers[f"embed.{name}_lm_mask"] = layers.Embed(
-            self.config[EMBEDDING_DIMENSION],
-            self.config[REGULARIZATION_CONSTANT],
-            f"{name}_lm_mask",
-            self.config[SIMILARITY_TYPE],
-        )
-        self._tf_layers[f"embed.{name}_golden_token"] = layers.Embed(
-            self.config[EMBEDDING_DIMENSION],
-            self.config[REGULARIZATION_CONSTANT],
-            f"{name}_golden_token",
-            self.config[SIMILARITY_TYPE],
-        )
-        self._tf_layers[f"loss.{name}_mask"] = layers.DotProductLoss(
-            self.config[NUM_NEG],
-            self.config[LOSS_TYPE],
-            self.config[MAX_POS_SIM],
-            self.config[MAX_NEG_SIM],
-            self.config[USE_MAX_NEG_SIM],
-            self.config[NEGATIVE_MARGIN_SCALE],
-            self.config[SCALE_LOSS],
-            # set to 1 to get deterministic behaviour
-            parallel_iterations=1 if self.random_seed is not None else 1000,
-        )
+
+        self._prepare_embed_layers(f"{name}_lm_mask")
+        self._prepare_embed_layers(f"{name}_golden_token")
+
+        self._prepare_dot_product_loss(f"{name}_mask")
 
     def _prepare_label_classification_layers(self) -> None:
-        self._tf_layers["embed.text"] = layers.Embed(
-            self.config[EMBEDDING_DIMENSION],
-            self.config[REGULARIZATION_CONSTANT],
-            "text",
-            self.config[SIMILARITY_TYPE],
-        )
-        self._tf_layers["embed.label"] = layers.Embed(
-            self.config[EMBEDDING_DIMENSION],
-            self.config[REGULARIZATION_CONSTANT],
-            "label",
-            self.config[SIMILARITY_TYPE],
-        )
-        self._tf_layers["loss.label"] = layers.DotProductLoss(
-            self.config[NUM_NEG],
-            self.config[LOSS_TYPE],
-            self.config[MAX_POS_SIM],
-            self.config[MAX_NEG_SIM],
-            self.config[USE_MAX_NEG_SIM],
-            self.config[NEGATIVE_MARGIN_SCALE],
-            self.config[SCALE_LOSS],
-            # set to 1 to get deterministic behaviour
-            parallel_iterations=1 if self.random_seed is not None else 1000,
-        )
+        self._prepare_embed_layers(TEXT)
+        self._prepare_embed_layers(LABEL)
+
+        self._prepare_dot_product_loss(LABEL)
 
     def _prepare_entity_recognition_layers(self) -> None:
         self._tf_layers["embed.logits"] = layers.Embed(
@@ -1262,7 +1247,7 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
             self.tf_label_data[LABEL_MASK][0],
             self.label_name,
         )
-        all_labels_embed = self._tf_layers["embed.label"](x)
+        all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x)
 
         return all_label_ids, all_labels_embed
 
@@ -1304,10 +1289,10 @@ def _label_loss(
     ) -> tf.Tensor:
         all_label_ids, all_labels_embed = self._create_all_labels()
 
-        a_embed = self._tf_layers["embed.text"](a)
-        b_embed = self._tf_layers["embed.label"](b)
+        a_embed = self._tf_layers[f"embed.{TEXT}"](a)
+        b_embed = self._tf_layers[f"embed.{LABEL}"](b)
 
-        return self._tf_layers["loss.label"](
+        return self._tf_layers[f"loss.{LABEL}"](
             a_embed, b_embed, label_ids, all_labels_embed, all_label_ids
         )
 
@@ -1419,13 +1404,13 @@ def batch_predict(
 
             # get _cls_ vector for intent classification
             cls = self._last_token(text_transformed, sequence_lengths)
-            cls_embed = self._tf_layers["embed.text"](cls)
+            cls_embed = self._tf_layers[f"embed.{TEXT}"](cls)
 
             # pytype: disable=attribute-error
-            sim_all = self._tf_layers["loss.label"].sim(
+            sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
                 cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
             )
-            scores = self._tf_layers["loss.label"].confidence_from_sim(
+            scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim(
                 sim_all, self.config[SIMILARITY_TYPE]
             )
             # pytype: enable=attribute-error
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 5d8592797f5b..09fd96d13d03 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -38,6 +38,7 @@
     BILOU_FLAG,
     SOFTMAX,
     AUTO,
+    BALANCED,
 )
 import rasa.utils.common as common_utils
 from rasa.utils.tensorflow.models import RasaModel
@@ -77,7 +78,7 @@ def required_components(cls) -> List[Type[Component]]:
         BATCH_SIZES: [64, 256],
         # Strategy used when creating batches.
         # Can be either 'sequence' or 'balanced'.
-        BATCH_STRATEGY: "balanced",
+        BATCH_STRATEGY: BALANCED,
         # Number of epochs to train
         EPOCHS: 300,
         # Set random seed to any 'int' to get reproducible results
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 4e6169586b0f..91f279176a0a 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -8,7 +8,15 @@
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import Featurizer
-from rasa.nlu.classifiers.diet_classifier import DIETClassifier, DIET
+from rasa.nlu.classifiers.diet_classifier import (
+    DIETClassifier,
+    DIET,
+    TEXT_FEATURES,
+    LABEL_FEATURES,
+    TEXT_MASK,
+    LABEL_MASK,
+    LABEL_IDS,
+)
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -50,6 +58,7 @@
     RETRIEVAL_INTENT,
     SOFTMAX,
     AUTO,
+    BALANCED,
 )
 from rasa.nlu.constants import (
     RESPONSE,
@@ -115,7 +124,7 @@ def required_components(cls) -> List[Type[Component]]:
         BATCH_SIZES: [64, 256],
         # Strategy used when creating batches.
         # Can be either 'sequence' or 'balanced'.
-        BATCH_STRATEGY: "balanced",
+        BATCH_STRATEGY: BALANCED,
         # Number of epochs to train
         EPOCHS: 300,
         # Set random seed to any 'int' to get reproducible results
@@ -204,7 +213,7 @@ def __init__(
 
     @property
     def label_key(self) -> Text:
-        return "label_ids"
+        return LABEL_IDS
 
     @staticmethod
     def model_class() -> Type[RasaModel]:
@@ -284,20 +293,20 @@ def process(self, message: Message, **kwargs: Any) -> None:
 
 class DIET2DIET(DIET):
     def _check_data(self) -> None:
-        if "text_features" not in self.data_signature:
+        if TEXT_FEATURES not in self.data_signature:
             raise ValueError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
-        if "label_features" not in self.data_signature:
+        if LABEL_FEATURES not in self.data_signature:
             raise ValueError(
                 f"No label features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
         if (
             self.config[SHARE_HIDDEN_LAYERS]
-            and self.data_signature["text_features"]
-            != self.data_signature["label_features"]
+            and self.data_signature[TEXT_FEATURES]
+            != self.data_signature[LABEL_FEATURES]
         ):
             raise ValueError(
                 "If hidden layer weights are shared, data signatures "
@@ -330,17 +339,17 @@ def _prepare_layers(self) -> None:
         self._prepare_label_classification_layers()
 
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
-        all_label_ids = self.tf_label_data["label_ids"][0]
+        all_label_ids = self.tf_label_data[LABEL_IDS][0]
 
-        mask_label = self.tf_label_data["label_mask"][0]
+        mask_label = self.tf_label_data[LABEL_MASK][0]
         sequence_lengths_label = self._get_sequence_lengths(mask_label)
 
         label_transformed, _, _, _ = self._create_sequence(
-            self.tf_label_data["label_features"], mask_label, self.label_name
+            self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name
         )
         cls_label = self._last_token(label_transformed, sequence_lengths_label)
 
-        all_labels_embed = self._tf_layers["embed.label"](cls_label)
+        all_labels_embed = self._tf_layers[f"embed.{LABEL}"](cls_label)
 
         return all_label_ids, all_labels_embed
 
@@ -349,7 +358,7 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        mask_text = tf_batch_data["text_mask"][0]
+        mask_text = tf_batch_data[TEXT_MASK][0]
         sequence_lengths_text = self._get_sequence_lengths(mask_text)
 
         (
@@ -358,18 +367,18 @@ def batch_loss(
             text_seq_ids,
             lm_mask_bool_text,
         ) = self._create_sequence(
-            tf_batch_data["text_features"],
+            tf_batch_data[TEXT_FEATURES],
             mask_text,
             self.text_name,
             self.config[MASKED_LM],
             sequence_ids=True,
         )
 
-        mask_label = tf_batch_data["label_mask"][0]
+        mask_label = tf_batch_data[LABEL_MASK][0]
         sequence_lengths_label = self._get_sequence_lengths(mask_label)
 
         label_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["label_features"], mask_label, self.label_name
+            tf_batch_data[LABEL_FEATURES], mask_label, self.label_name
         )
 
         losses = []
@@ -390,7 +399,7 @@ def batch_loss(
         # get _cls_ vector for label classification
         cls_text = self._last_token(text_transformed, sequence_lengths_text)
         cls_label = self._last_token(label_transformed, sequence_lengths_label)
-        label_ids = tf_batch_data["label_ids"][0]
+        label_ids = tf_batch_data[LABEL_IDS][0]
 
         loss, acc = self._label_loss(cls_text, cls_label, label_ids)
         self.response_loss.update_state(loss)
@@ -406,11 +415,11 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )
 
-        mask_text = tf_batch_data["text_mask"][0]
+        mask_text = tf_batch_data[TEXT_MASK][0]
         sequence_lengths_text = self._get_sequence_lengths(mask_text)
 
         text_transformed, _, _, _ = self._create_sequence(
-            tf_batch_data["text_features"], mask_text, self.text_name
+            tf_batch_data[TEXT_FEATURES], mask_text, self.text_name
         )
 
         out = {}
@@ -420,12 +429,12 @@ def batch_predict(
 
         # get _cls_ vector for intent classification
         cls = self._last_token(text_transformed, sequence_lengths_text)
-        cls_embed = self._tf_layers["embed.text"](cls)
+        cls_embed = self._tf_layers[f"embed.{TEXT}"](cls)
 
-        sim_all = self._tf_layers["loss.label"].sim(
+        sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
             cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
         )
-        scores = self._tf_layers["loss.label"].confidence_from_sim(
+        scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim(
             sim_all, self.config[SIMILARITY_TYPE]
         )
         out["i_scores"] = scores
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 0870f4655235..31a41dc1009a 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -58,3 +58,6 @@
 AUTO = "auto"
 INNER = "inner"
 COSINE = "cosine"
+
+BALANCED = "balanced"
+SEQUENCE = "sequence"
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 4bac304b2b16..7e4da6ccab54 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -7,7 +7,7 @@
 from sklearn.model_selection import train_test_split
 from typing import Optional, Dict, Text, List, Tuple, Any, Union, Generator, NamedTuple
 from collections import defaultdict
-
+from rasa.utils.tensorflow.constants import BALANCED, SEQUENCE
 
 logger = logging.getLogger(__name__)
 
@@ -312,7 +312,7 @@ def batch_tuple_sizes(self) -> Dict[Text, int]:
         return tuple_sizes
 
     def as_tf_dataset(
-        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
+        self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
     ) -> tf.data.Dataset:
         """Create tf dataset."""
 
@@ -361,7 +361,7 @@ def append_type(features: np.ndarray):
         return tuple(shapes), tuple(types)
 
     def _gen_batch(
-        self, batch_size: int, batch_strategy: Text = "sequence", shuffle: bool = False
+        self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
     ) -> Generator[Tuple, None, None]:
         """Generate batches."""
 
@@ -370,7 +370,7 @@ def _gen_batch(
         if shuffle:
             data = self.shuffled_data(data)
 
-        if batch_strategy == "balanced":
+        if batch_strategy == BALANCED:
             data = self.balanced_data(data, batch_size, shuffle)
 
         num_batches = self.num_examples // batch_size + int(
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 41f80571dbc9..d723278d288a 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 from rasa.utils.common import is_logging_disabled
 from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature
+from rasa.utils.tensorflow.constants import SEQUENCE
 
 logger = logging.getLogger(__name__)
 
@@ -155,7 +156,7 @@ def load(
             batch_size=1,
             evaluate_every_num_epochs=0,
             evaluate_on_num_examples=0,
-            batch_strategy="sequence",
+            batch_strategy=SEQUENCE,
             silent=True,  # don't confuse users with training output
             eager=True,  # no need to build tf graph, eager is faster here
         )
@@ -241,7 +242,7 @@ def _get_tf_evaluation_functions(
 
         def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset:
             return evaluation_model_data.as_tf_dataset(
-                _batch_size, "sequence", shuffle=False
+                _batch_size, SEQUENCE, shuffle=False
             )
 
         self._training = False  # needed for tf graph mode

From 3057c1e73cc2b546a8bf9706e389f6a888077f4f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 14:58:26 +0100
Subject: [PATCH 591/633] update components.rst

---
 docs/nlu/components.rst | 169 ++++++++++++++++++++--------------------
 1 file changed, 86 insertions(+), 83 deletions(-)

diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 62e90514dfb1..6dcdd20abaab 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -281,7 +281,7 @@ LanguageModelTokenizer
 :Requires: :ref:`HFTransformersNLP`
 :Description:
     Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component.
-    Must be used whenever the ``LanguageModelFeaturizer`` is used.
+    Must be used whenever the :ref:`LanguageModelFeaturizer` is used.
 :Configuration:
 
     .. code-block:: yaml
@@ -306,7 +306,7 @@ As those feature vectors would normally take up a lot of memory, we store them a
 Sparse features only store the values that are non zero and their positions in the vector.
 Thus, we save a lot of memory and are able to train on larger datasets.
 
-By default all featurizers will return a matrix of length (number-of-tokens x feature-dimension).
+By default all featurizers will return a matrix of length ``(number-of-tokens x feature-dimension)``.
 So, the returned matrix will have a feature vector for every token.
 This allows us to train sequence models.
 However, the additional token at the end (e.g. ``__CLS__``) contains features for the complete utterance.
@@ -390,12 +390,12 @@ ConveRTFeaturizer
     Uses the `default signature <https://github.com/PolyAI-LDN/polyai-models#tfhub-signatures>`_ to compute vector
     representations of input text.
 
-    .. warning::
-        Since ``ConveRT`` model is trained only on an english corpus of conversations, this featurizer should only
-        be used if your training data is in english language.
+    .. note::
+        Since ``ConveRT`` model is trained only on an English corpus of conversations, this featurizer should only
+        be used if your training data is in English language.
 
     .. note::
-        To use ``ConveRTFeaturizer`` you need to install additional tensorflow libraries (``tensorflow_text`` and
+        To use ``ConveRTFeaturizer`` you need to install additional TenorFlow libraries (``tensorflow_text`` and
         ``tensorflow_hub``). You should do a pip install of Rasa with ``pip install rasa[convert]`` to install those.
 
 :Configuration:
@@ -421,13 +421,13 @@ LanguageModelFeaturizer
     Uses the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component to compute vector
     representations of input text.
 
-    .. warning::
+    .. note::
         Please make sure that you use a language model which is pre-trained on the same language corpus as that of your
         training data.
 
 :Configuration:
 
-    Include ``HFTransformersNLP`` and :ref:`LanguageModelTokenizer` components before this component. Use
+    Include :ref:`HFTransformersNLP` and :ref:`LanguageModelTokenizer` components before this component. Use
     :ref:`LanguageModelTokenizer` to ensure tokens are correctly set for all components throughout the pipeline.
 
     .. code-block:: yaml
@@ -450,8 +450,8 @@ RegexFeaturizer
     For each regex, a feature will be set marking whether this expression was found in the input, which will later
     be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned
     during the training phase, that this set feature indicates a certain intent / entity).
-    Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` and the
-    ``DIETClassifier`` components!
+    Regex features for entity extraction are currently only supported by the :ref:`CRFEntityExtractor` and the
+    :ref:`diet-classifier` components!
 
 :Configuration:
 
@@ -460,6 +460,8 @@ RegexFeaturizer
         pipeline:
         - name: "RegexFeaturizer"
 
+.. _CountVectorsFeaturizer:
+
 CountVectorsFeaturizer
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -473,11 +475,6 @@ CountVectorsFeaturizer
     `sklearn's CountVectorizer <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_.
     All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature.
 
-    .. note::
-        If the words in the model language cannot be split by whitespace,
-        a language-specific tokenizer is required in the pipeline before this component
-        (e.g. using ``JiebaTokenizer`` for Chinese).
-
 :Configuration:
     See `sklearn's CountVectorizer docs <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
     for detailed description of the configuration parameters.
@@ -489,11 +486,11 @@ CountVectorsFeaturizer
     .. note::
         Option ‘char_wb’ creates character n-grams only from text inside word boundaries;
         n-grams at the edges of words are padded with space.
-        This option can be used to create `Subword Semantic Hashing <https://arxiv.org/abs/1810.07150>`_
+        This option can be used to create `Subword Semantic Hashing <https://arxiv.org/abs/1810.07150>`_.
 
     .. note::
         For character n-grams do not forget to increase ``min_ngram`` and ``max_ngram`` parameters.
-        Otherwise the vocabulary will contain only single letters
+        Otherwise the vocabulary will contain only single letters.
 
     Handling Out-Of-Vacabulary (OOV) words:
 
@@ -585,28 +582,31 @@ LexicalSyntacticFeaturizer
     Moves with a sliding window over every token in the user message and creates features according to the
     configuration (see below). As a default configuration is present, you don't need to specify a configuration.
 :Configuration:
-    You need to configure what kind of lexical and syntactic features the featurizer should extract.
+    You can configure what kind of lexical and syntactic features the featurizer should extract.
     The following features are available:
 
-    ==============  =============================================================================================
-    Feature Name    Description
-    ==============  =============================================================================================
-    BOS             Checks if the token is at the beginning of the sentence.
-    EOS             Checks if the token is at the end of the sentence.
-    low             Checks if the token is lower case.
-    upper           Checks if the token is upper case.
-    title           Checks if the token starts with an uppercase character and all remaining characters are
-                    lowercased.
-    digit           Checks if the token contains just digits.
-    prefix5         Take the first five characters of the token.
-    prefix2         Take the first two characters of the token.
-    suffix5         Take the last five characters of the token.
-    suffix3         Take the last three characters of the token.
-    suffix2         Take the last two characters of the token.
-    suffix1         Take the last character of the token.
-    pos             Take the Part-of-Speech tag of the token (spaCy required).
-    pos2            Take the first two characters of the Part-of-Speech tag of the token (spaCy required).
-    ==============  =============================================================================================
+    .. code-block:: yaml
+
+        # ==============  ==========================================================================================
+        # Feature Name    Description
+        # ==============  ==========================================================================================
+        # BOS             Checks if the token is at the beginning of the sentence.
+        # EOS             Checks if the token is at the end of the sentence.
+        # low             Checks if the token is lower case.
+        # upper           Checks if the token is upper case.
+        # title           Checks if the token starts with an uppercase character and all remaining characters are
+        #                 lowercased.
+        # digit           Checks if the token contains just digits.
+        # prefix5         Take the first five characters of the token.
+        # prefix2         Take the first two characters of the token.
+        # suffix5         Take the last five characters of the token.
+        # suffix3         Take the last three characters of the token.
+        # suffix2         Take the last two characters of the token.
+        # suffix1         Take the last character of the token.
+        # pos             Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
+        # pos2            Take the first two characters of the Part-of-Speech tag of the token
+        #                 (``SpacyTokenizer`` required).
+        # ==============  ==========================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
     previous tokens, the current token, and the next tokens in the sliding window.
@@ -624,11 +624,6 @@ LexicalSyntacticFeaturizer
               "BOS",
               "EOS",
               "low",
-              "prefix5",
-              "prefix2",
-              "suffix5",
-              "suffix3",
-              "suffix2",
               "upper",
               "title",
               "digit",
@@ -677,6 +672,10 @@ MitieIntentClassifier
 SklearnIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~
 
+.. warning::
+    ``SklearnIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
+    :ref:`migration guide <migration-to-rasa-1.8>` for more details.
+
 :Short: Sklearn intent classifier
 :Outputs: ``intent`` and ``intent_ranking``
 :Requires: ``dense_features`` for user messages
@@ -703,10 +702,6 @@ SklearnIntentClassifier
     rankings of the labels that did not "win". The ``SklearnIntentClassifier`` needs to be preceded by a dense
     featurizer in the pipeline. This dense featurizer creates the features used for the classification.
 
-    .. warning::
-        ``SklearnIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
-        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
-
 :Configuration:
     During the training of the SVM a hyperparameter search is run to
     find the best parameter set. In the config, you can specify the parameters
@@ -729,6 +724,10 @@ SklearnIntentClassifier
 EmbeddingIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
+.. warning::
+    ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
+    :ref:`migration guide <migration-to-rasa-1.8>` for more details.
+
 :Short: Embedding intent classifier for intent classification
 :Outputs: ``intent`` and ``intent_ranking``
 :Requires: ``dense_features`` and/or ``sparse_features`` for user messages, and optionally the intent
@@ -758,13 +757,11 @@ EmbeddingIntentClassifier
     additional hidden layers are added together with dropout.
     This algorithm also provides similarity rankings of the labels that did not "win".
 
-    .. note:: If during prediction time a message contains **only** words unseen during training,
-              and no Out-Of-Vacabulary preprocessor was used,
-              empty intent ``None`` is predicted with confidence ``0.0``.
-
-    .. warning::
-        ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
-        :ref:`migration guide <migration-to-rasa-1.8>` for more details.
+    .. note:: If during prediction time a message contains **only** words unseen during training
+              and no Out-Of-Vacabulary preprocessor was used, an empty intent ``None`` is predicted with confidence
+              ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer
+              as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence
+              value ``> 0.0``.
 
 :Configuration:
 
@@ -948,7 +945,7 @@ KeywordIntentClassifier
     This means the entire example is the keyword, not the individual words in the example.
 
     .. note:: This classifier is intended only for small projects or to get started. If
-              you have few NLU training data you can use one of our pipelines
+              you have few NLU training data, you can take a look at the recommended pipelines in
               :ref:`choosing-a-pipeline`.
 
 :Configuration:
@@ -992,15 +989,17 @@ ResponseSelector
     Response Selector component can be used to build a response retrieval model to directly predict a bot response from
     a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`.
     It embeds user inputs and response labels into the same space and follows the exact same
-    neural network architecture and optimization as the ``DIETClassifier``.
+    neural network architecture and optimization as the :ref:`diet-classifier`.
 
-    .. note:: If during prediction time a message contains **only** words unseen during training,
-              and no Out-Of-Vacabulary preprocessor was used,
-              empty response ``None`` is predicted with confidence ``0.0``.
+    .. note:: If during prediction time a message contains **only** words unseen during training
+              and no Out-Of-Vacabulary preprocessor was used, an empty intent ``None`` is predicted with confidence
+              ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer
+              as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence
+              value ``> 0.0``.
 
 :Configuration:
 
-    The algorithm includes all the hyperparameters that ``DIETClassifier`` uses.
+    The algorithm includes all the hyperparameters that :ref:`diet-classifier` uses.
     In addition, the component can also be configured to train a response selector for a particular retrieval intent.
 
         - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained.
@@ -1106,7 +1105,7 @@ ResponseSelector
 Entity Extractors
 -----------------
 
-Entity extractors extract entities, such as person names or locations, from the user input.
+Entity extractors extract entities, such as person names or locations, from the user message.
 
 MitieEntityExtractor
 ~~~~~~~~~~~~~~~~~~~~
@@ -1242,7 +1241,7 @@ CRFEntityExtractor
 
         {
             "entities": [{
-                "value":"New York City",
+                "value": "New York City",
                 "start": 20,
                 "end": 33,
                 "entity": "city",
@@ -1273,26 +1272,28 @@ CRFEntityExtractor
     However, you can overwrite the default configuration.
     The following features are available:
 
-    ===============  =============================================================================
-    Feature Name     Description
-    ===============  =============================================================================
-    low              Checks if the token is lower case.
-    upper            Checks if the token is upper case.
-    title            Checks if the token starts with an uppercase character and all remaining
-                     characters are lowercased.
-    digit            Checks if the token contains just digits.
-    prefix5          Take the first five characters of the token.
-    prefix2          Take the first two characters of the token.
-    suffix5          Take the last five characters of the token.
-    suffix3          Take the last three characters of the token.
-    suffix2          Take the last two characters of the token.
-    suffix1          Take the last character of the token.
-    pos              Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
-    pos2             Take the first two characters of the Part-of-Speech tag of the token
-                     (``SpacyTokenizer`` required).
-    pattern          Take the patterns defined by ``RegexFeaturizer``.
-    bias             Add an additional "bias" feature to the list of features.
-    ===============  =============================================================================
+    .. code-block:: yaml
+
+        # ==============  ==========================================================================================
+        # Feature Name    Description
+        # ==============  ==========================================================================================
+        # low             Checks if the token is lower case.
+        # upper           Checks if the token is upper case.
+        # title           Checks if the token starts with an uppercase character and all remaining characters are
+        #                 lowercased.
+        # digit           Checks if the token contains just digits.
+        # prefix5         Take the first five characters of the token.
+        # prefix2         Take the first two characters of the token.
+        # suffix5         Take the last five characters of the token.
+        # suffix3         Take the last three characters of the token.
+        # suffix2         Take the last two characters of the token.
+        # suffix1         Take the last character of the token.
+        # pos             Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
+        # pos2            Take the first two characters of the Part-of-Speech tag of the token
+        #                 (``SpacyTokenizer`` required).
+        # pattern         Take the patterns defined by ``RegexFeaturizer``.
+        # bias            Add an additional "bias" feature to the list of features.
+        # ==============  ==========================================================================================
 
     As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
     previous tokens, the current token, and the next tokens in the sliding window.
@@ -1454,8 +1455,10 @@ DIETClassifier
     similarities with negative samples.
 
     .. note:: If during prediction time a message contains **only** words unseen during training
-              and no Out-Of-Vacabulary preprocessor was used,
-              an empty intent ``None`` is predicted with confidence ``0.0``.
+              and no Out-Of-Vacabulary preprocessor was used, an empty intent ``None`` is predicted with confidence
+              ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer
+              as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence
+              value ``> 0.0``.
 
 :Configuration:
 

From 85ff063cfe8cfcc832b0036f5c722554d64dc082 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 15:08:33 +0100
Subject: [PATCH 592/633] update choosing a pipeline

---
 docs/nlu/choosing-a-pipeline.rst | 25 ++++++++++++-------------
 docs/nlu/components.rst          |  4 ++++
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index aaceb420d9b0..eb143a16bc7d 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -51,7 +51,7 @@ Pre-trained word embeddings are helpful as they already encode some kind of ling
 For example, if you have a sentence like "I want to buy apples" in your training data, and Rasa is asked to predict
 the intent for "get pears", your model already knows that the words "apples" and "pears" are very similar.
 This is especially useful if you don’t have enough training data.
-The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
+The advantage of the :ref:`ConveRTFeaturizer` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence.
 However, ``ConveRT`` is only available in English.
 If your training data is not in English, but you still want to use pre-trained word embeddings, we recommend using
@@ -61,7 +61,7 @@ the following pipeline:
     :language: yaml
 
 It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`.
-``SpacyFeaturizer`` provides pre-trained word embeddings from either GloVe or fastText in many different languages
+:ref:`SpacyFeaturizer` provides pre-trained word embeddings from either GloVe or fastText in many different languages
 (see :ref:`pretrained-word-vectors`).
 
 If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
@@ -88,7 +88,7 @@ A pipeline usually consist of three main parts:
 Tokenization
 ~~~~~~~~~~~~
 If your chosen language is whitespace-tokenized (words are separated by spaces), you
-can use the ``WhitespaceTokenizer``. If this is not the case you should use a different tokenizer.
+can use the :ref:`WhitespaceTokenizer`. If this is not the case you should use a different tokenizer.
 We support a number of different :ref:`tokenizers <tokenizers>`, or you can
 create your own :ref:`custom tokenizer <custom-nlu-components>`.
 
@@ -106,7 +106,7 @@ and can train your model to be more domain specific. For example, in general Eng
 related to "symmetry", but very different to the word "cash". In a banking domain, "balance" and "cash" are closely
 related and you'd like your model to capture that.
 You should only use featurizers from the category :ref:`sparse featurizers <text-featurizers>`, such as
-``CountVectorsFeaturizer``, ``RegexFeaturizer`` or ``LexicalSyntacticFeaturizer``, if you don't want to use
+:ref:`CountVectorsFeaturizer`, :ref:`RegexFeaturizer` or :ref:`LexicalSyntacticFeaturizer`, if you don't want to use
 pre-trained word embeddings.
 
 The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
@@ -119,15 +119,15 @@ We support a few components that provide pre-trained word embeddings:
 3. :ref:`ConveRTFeaturizer`
 4. :ref:`LanguageModelFeaturizer`
 
-If your training data is in English, we recommend using the ``ConveRTFeaturizer``.
-The advantage of the ``ConveRTFeaturizer`` is that it doesn't treat each word of the user message independently, but
+If your training data is in English, we recommend using the :ref:`ConveRTFeaturizer`.
+The advantage of the :ref:`ConveRTFeaturizer` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence. For example, if you
-have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
+have a training example, like: "Can I book a car?", and Rasa is asked to predict the intent for "I need a ride from
 my place", since the contextual vector representation for both examples are already very similar, the intent classified
 for both is highly likely to be the same. This is also useful if you don't have enough training data.
 
-An alternative to ``ConveRTFeaturizer`` is the ``LanguageModelFeaturizer`` which uses pre-trained language models such
-as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See
+An alternative to :ref:`ConveRTFeaturizer` is the :ref:`LanguageModelFeaturizer` which uses pre-trained language
+models such as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See
 :ref:`HFTransformersNLP` for a full list of supported language models.
 
 If your training data is not in English you can also use a different variant of a language model which
@@ -135,17 +135,16 @@ is pre-trained in the language specific to your training data.
 For example, there are chinese (``bert-base-chinese``) and japanese (``bert-base-japanese``) variants of the BERT model.
 A full list of different variants of
 these language models is available in the
-`official documentation of the transformers library <https://huggingface.co/transformers/pretrained_models.html>`_.
+`official documentation of the Transformers library <https://huggingface.co/transformers/pretrained_models.html>`_.
 
-``SpacyFeaturizer`` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`),
+:ref:`SpacyFeaturizer` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`),
 so you can use this as another alternative, depending on the language of your training data.
-So, this featurizer can also be an alternate option depending on the language of your training data.
 
 Entity Recognition / Intent Classification / Response Selectors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Depending on your data you may want to only perform intent classification, entity recognition or response selection.
 Or you might want to combine multiple of those tasks.
-We support several components for each of the task. All of them are listed in :ref:`components`.
+We support several components for each of the tasks. All of them are listed in :ref:`components`.
 We recommend using :ref:`diet-classifier` for intent classification and entity recognition
 and :ref:`response-selector` for response selection.
 
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index 6dcdd20abaab..aa9a1d3033fb 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -156,6 +156,8 @@ modeling hierarchical intent structure, use these flags with any tokenizer:
     .. note:: All tokenizer add an additional token ``__CLS__`` to the end of the list of tokens when tokenizing
               text and responses.
 
+.. _WhitespaceTokenizer:
+
 WhitespaceTokenizer
 ~~~~~~~~~~~~~~~~~~~
 
@@ -436,6 +438,8 @@ LanguageModelFeaturizer
         - name: "LanguageModelFeaturizer"
 
 
+.. _RegexFeaturizer:
+
 RegexFeaturizer
 ~~~~~~~~~~~~~~~
 

From d92f689c2c67d7c4160710b5750c50fa778e0ab3 Mon Sep 17 00:00:00 2001
From: akelad <akela@rasa.com>
Date: Tue, 25 Feb 2020 15:24:48 +0100
Subject: [PATCH 593/633] quick fix for docs typos/formatting

---
 docs/nlu/choosing-a-pipeline.rst | 4 +++-
 docs/nlu/components.rst          | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst
index eb143a16bc7d..04d94966da81 100644
--- a/docs/nlu/choosing-a-pipeline.rst
+++ b/docs/nlu/choosing-a-pipeline.rst
@@ -54,6 +54,7 @@ This is especially useful if you don’t have enough training data.
 The advantage of the :ref:`ConveRTFeaturizer` is that it doesn't treat each word of the user message independently, but
 creates a contextual vector representation for the complete sentence.
 However, ``ConveRT`` is only available in English.
+
 If your training data is not in English, but you still want to use pre-trained word embeddings, we recommend using
 the following pipeline:
 
@@ -66,7 +67,8 @@ It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`.
 
 If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
 and can train your model to be more domain specific.
-If you don't want to use pre-trained word embeddings, we recommend using the following pipeline:
+If there are no word embeddings for your language or you have very domain specific terminology,
+we recommend using the following pipeline:
 
 .. literalinclude:: ../../data/configs_for_docs/default_config.yml
     :language: yaml
diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst
index aa9a1d3033fb..adc8a98c548d 100644
--- a/docs/nlu/components.rst
+++ b/docs/nlu/components.rst
@@ -397,7 +397,7 @@ ConveRTFeaturizer
         be used if your training data is in English language.
 
     .. note::
-        To use ``ConveRTFeaturizer`` you need to install additional TenorFlow libraries (``tensorflow_text`` and
+        To use ``ConveRTFeaturizer`` you need to install additional TensorFlow libraries (``tensorflow_text`` and
         ``tensorflow_hub``). You should do a pip install of Rasa with ``pip install rasa[convert]`` to install those.
 
 :Configuration:

From 830c66ddcaf9193cbf1be1534b8ae718cfe4be40 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 15:32:17 +0100
Subject: [PATCH 594/633] use migration guide constant

---
 rasa/core/policies/embedding_policy.py              |  3 ++-
 rasa/nlu/classifiers/embedding_intent_classifier.py |  3 ++-
 rasa/nlu/classifiers/sklearn_intent_classifier.py   |  4 ++--
 rasa/nlu/components.py                              |  5 +++--
 rasa/nlu/config.py                                  | 10 +++++++---
 rasa/nlu/extractors/crf_entity_extractor.py         |  9 ++++++---
 6 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index c55901af6d56..b3ad427db0ca 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, Optional, Text
 
+from rasa.constants import DOCS_URL_MIGRATION_GUIDE
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
 from rasa.core.featurizers import TrackerFeaturizer
 from rasa.core.policies.ted_policy import TEDPolicy
@@ -155,5 +156,5 @@ def __init__(
             f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. "
             f"Use 'TEDPolicy' instead.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 09fd96d13d03..8135be00cef7 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Dict, Optional, Text, List, Type
 
+from rasa.constants import DOCS_URL_MIGRATION_GUIDE
 from rasa.nlu.featurizers.featurizer import Featurizer
 from rasa.nlu.components import Component
 from rasa.nlu.classifiers.diet_classifier import DIETClassifier
@@ -163,5 +164,5 @@ def __init__(
             "'EmbeddingIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index efa158f65441..47bd8940e907 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -7,7 +7,7 @@
 import numpy as np
 
 import rasa.utils.io as io_utils
-from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
+from rasa.constants import DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_MIGRATION_GUIDE
 from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.nlu.featurizers.featurizer import DenseFeaturizer
 from rasa.nlu.components import Component
@@ -69,7 +69,7 @@ def __init__(
             "'SklearnIntentClassifier' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
 
     @classmethod
diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index c664e857d9da..b09e76604219 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -2,6 +2,7 @@
 import typing
 from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple, Type
 
+from rasa.constants import DOCS_URL_MIGRATION_GUIDE
 from rasa.nlu.config import RasaNLUModelConfig, override_defaults, InvalidConfigError
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.utils.common import raise_warning
@@ -131,7 +132,7 @@ def _check_deprecated_attributes(component: "Component") -> None:
             f"which is deprecated. There is no need to specify "
             f"the list of attributes that a component provides.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
     if hasattr(component, "requires"):
         raise_warning(
@@ -140,7 +141,7 @@ def _check_deprecated_attributes(component: "Component") -> None:
             f"to specify which components are required to be present "
             f"in the pipeline by this component.",
             category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
 
 
diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py
index b59af0ee5990..00bfdf12a56b 100644
--- a/rasa/nlu/config.py
+++ b/rasa/nlu/config.py
@@ -2,10 +2,14 @@
 import logging
 import os
 import ruamel.yaml as yaml
-from typing import Any, Dict, List, Optional, Text, Union, Tuple
+from typing import Any, Dict, List, Optional, Text, Union
 
 import rasa.utils.io
-from rasa.constants import DEFAULT_CONFIG_PATH, DOCS_URL_PIPELINE
+from rasa.constants import (
+    DEFAULT_CONFIG_PATH,
+    DOCS_URL_PIPELINE,
+    DOCS_URL_MIGRATION_GUIDE,
+)
 from rasa.nlu.utils import json_to_string
 import rasa.utils.common as common_utils
 
@@ -124,7 +128,7 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No
                     "the components you want to use directly to your configuration "
                     "file.",
                     FutureWarning,
-                    docs="https://rasa.com/docs/rasa/migration-guide/",
+                    docs=DOCS_URL_MIGRATION_GUIDE,
                 )
 
                 # replaces the template with the actual components
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 780a3a47960b..f92e339a427e 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -14,7 +14,11 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message, TrainingData
 from rasa.nlu.constants import TOKENS_NAMES, TEXT, DENSE_FEATURE_NAMES, ENTITIES
-from rasa.constants import DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_COMPONENTS
+from rasa.constants import (
+    DOCS_URL_TRAINING_DATA_NLU,
+    DOCS_URL_COMPONENTS,
+    DOCS_URL_MIGRATION_GUIDE,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -106,8 +110,7 @@ def __init__(
         common_utils.raise_warning(
             "'CRFEntityExtractor' is deprecated and will be removed in version "
             "2.0. Use 'DIETClassifier' instead.",
-            category=FutureWarning,
-            docs="https://rasa.com/docs/rasa/migration-guide/",
+            docs=DOCS_URL_MIGRATION_GUIDE,
         )
 
     def _validate_configuration(self) -> None:

From c0afb86f8c358f193265270c3bc82503d8bbc38f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 15:37:08 +0100
Subject: [PATCH 595/633] refactor loss and f1 helpers

---
 rasa/nlu/classifiers/diet_classifier.py | 91 ++++++++++++++-----------
 1 file changed, 52 insertions(+), 39 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b77e7d56b56b..579ed97d12ee 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1183,12 +1183,14 @@ def _combine_sparse_dense_features(
     def _features_as_seq_ids(
         self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
     ) -> tf.Tensor:
-        # if there are dense features it's enough
+        """Creates dense labels for negative sampling."""
+
+        # if there are dense features - we can use them
         for f in features:
             if not isinstance(f, tf.SparseTensor):
                 return tf.stop_gradient(f)
 
-        # we need dense labels for negative sampling
+        # use additional sparse to dense layer
         for f in features:
             if isinstance(f, tf.SparseTensor):
                 return tf.stop_gradient(
@@ -1216,29 +1218,27 @@ def _create_sequence(
         sequence_ids: bool = False,
     ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
         if sequence_ids:
-            x_seq_ids = self._features_as_seq_ids(features, name)
+            seq_ids = self._features_as_seq_ids(features, name)
         else:
-            x_seq_ids = None
+            seq_ids = None
 
-        x = self._combine_sparse_dense_features(
+        inputs = self._combine_sparse_dense_features(
             features, mask, name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT]
         )
 
-        pre = self._tf_layers[f"ffnn.{name}"](x, self._training)
+        x = self._tf_layers[f"ffnn.{name}"](inputs, self._training)
 
         if masked_lm_loss:
-            pre, lm_mask_bool = self._tf_layers[f"{name}_input_mask"](
-                pre, mask, self._training
+            x, lm_mask_bool = self._tf_layers[f"{name}_input_mask"](
+                x, mask, self._training
             )
         else:
             lm_mask_bool = None
 
-        transformed = self._tf_layers[f"{name}_transformer"](
-            pre, 1 - mask, self._training
-        )
-        transformed = tfa.activations.gelu(transformed)
+        outputs = self._tf_layers[f"{name}_transformer"](x, 1 - mask, self._training)
+        outputs = tfa.activations.gelu(outputs)
 
-        return transformed, x, x_seq_ids, lm_mask_bool
+        return outputs, inputs, seq_ids, lm_mask_bool
 
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_IDS][0]
@@ -1253,15 +1253,34 @@ def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
 
     @staticmethod
     def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor:
-        last_index = tf.maximum(0, sequence_lengths - 1)
-        idxs = tf.stack([tf.range(tf.shape(last_index)[0]), last_index], axis=1)
-        return tf.gather_nd(x, idxs)
+        last_sequence_index = tf.maximum(0, sequence_lengths - 1)
+        batch_index = tf.range(tf.shape(last_sequence_index)[0])
+
+        indices = tf.stack([batch_index, last_sequence_index], axis=1)
+        return tf.gather_nd(x, indices)
+
+    def _f1_score_from_ids(
+        self, tag_ids: tf.Tensor, pred_ids: tf.Tensor, mask: tf.Tensor
+    ) -> tf.Tensor:
+        """Calculates f1 score for train predictions"""
+
+        mask_bool = tf.cast(mask[:, :, 0], tf.bool)
+        # pick only non padding values and flatten sequences
+        tag_ids_flat = tf.boolean_mask(tag_ids, mask_bool)
+        pred_ids_flat = tf.boolean_mask(pred_ids, mask_bool)
+        # set `0` prediction to not a prediction
+        tag_ids_flat_one_hot = tf.one_hot(tag_ids_flat - 1, self._num_tags - 1)
+        pred_ids_flat_one_hot = tf.one_hot(pred_ids_flat - 1, self._num_tags - 1)
+
+        return self._tf_layers["crf_f1_score"](
+            tag_ids_flat_one_hot, pred_ids_flat_one_hot
+        )
 
     def _mask_loss(
         self,
-        a_transformed: tf.Tensor,
-        a: tf.Tensor,
-        a_seq_ids: tf.Tensor,
+        outputs: tf.Tensor,
+        inputs: tf.Tensor,
+        seq_ids: tf.Tensor,
         lm_mask_bool: tf.Tensor,
         name: Text,
     ) -> tf.Tensor:
@@ -1273,15 +1292,16 @@ def _mask_loss(
         )
 
         lm_mask_bool = tf.squeeze(lm_mask_bool, -1)
-        a_t_masked = tf.boolean_mask(a_transformed, lm_mask_bool)
-        a_masked = tf.boolean_mask(a, lm_mask_bool)
-        a_masked_ids = tf.boolean_mask(a_seq_ids, lm_mask_bool)
+        # pick elements that were masked
+        outputs = tf.boolean_mask(outputs, lm_mask_bool)
+        inputs = tf.boolean_mask(inputs, lm_mask_bool)
+        ids = tf.boolean_mask(seq_ids, lm_mask_bool)
 
-        a_t_masked_embed = self._tf_layers[f"embed.{name}_lm_mask"](a_t_masked)
-        a_masked_embed = self._tf_layers[f"embed.{name}_golden_token"](a_masked)
+        outputs_embed = self._tf_layers[f"embed.{name}_lm_mask"](outputs)
+        inputs_embed = self._tf_layers[f"embed.{name}_golden_token"](inputs)
 
         return self._tf_layers[f"loss.{name}_mask"](
-            a_t_masked_embed, a_masked_embed, a_masked_ids, a_masked_embed, a_masked_ids
+            outputs_embed, inputs_embed, ids, inputs_embed, ids
         )
 
     def _label_loss(
@@ -1297,12 +1317,16 @@ def _label_loss(
         )
 
     def _entity_loss(
-        self, a: tf.Tensor, tag_ids: tf.Tensor, mask: tf.Tensor, sequence_lengths
+        self,
+        outputs: tf.Tensor,
+        tag_ids: tf.Tensor,
+        mask: tf.Tensor,
+        sequence_lengths: tf.Tensor,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
 
         sequence_lengths = sequence_lengths - 1  # remove cls token
         tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
-        logits = self._tf_layers["embed.logits"](a)
+        logits = self._tf_layers["embed.logits"](outputs)
 
         # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
@@ -1310,18 +1334,7 @@ def _entity_loss(
         loss = self._tf_layers["crf"].loss(logits, tag_ids, sequence_lengths)
         # pytype: enable=attribute-error
 
-        # calculate f1 score for train predictions
-        mask_bool = tf.cast(mask[:, :, 0], tf.bool)
-        # pick only non padding values and flatten sequences
-        tag_ids_flat = tf.boolean_mask(tag_ids, mask_bool)
-        pred_ids_flat = tf.boolean_mask(pred_ids, mask_bool)
-        # set `0` prediction to not a prediction
-        tag_ids_flat_one_hot = tf.one_hot(tag_ids_flat - 1, self._num_tags - 1)
-        pred_ids_flat_one_hot = tf.one_hot(pred_ids_flat - 1, self._num_tags - 1)
-
-        f1 = self._tf_layers["crf_f1_score"](
-            tag_ids_flat_one_hot, pred_ids_flat_one_hot
-        )
+        f1 = self._f1_score_from_ids(tag_ids, pred_ids, mask)
 
         return loss, f1
 

From 4de13a85eca9b54256c11db645c8571eebbbbad9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 15:42:22 +0100
Subject: [PATCH 596/633] review comments on featurizers

---
 rasa/nlu/extractors/crf_entity_extractor.py   |  6 ++--
 .../dense_featurizer/convert_featurizer.py    | 17 +++++-----
 .../dense_featurizer/lm_featurizer.py         | 34 +++++++++++--------
 .../nlu/utils/hugging_face/hf_transformers.py |  8 +++--
 4 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index f92e339a427e..6caccf8ddd29 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -409,11 +409,9 @@ def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any
                                 feature_name = prefix + ":" + feature + ":" + p_name
                                 word_features[feature_name] = matched
                             # pytype: enable=attribute-error
-                        elif feature == "pos" or feature == "pos2":
-                            if word is None:
-                                continue
+                        elif word and (feature == "pos" or feature == "pos2"):
                             value = self.function_dict[feature](word)
-                            word_features[prefix + ":" + feature] = value
+                            word_features[f"{prefix}:{feature}"] = value
                         else:
                             # append each feature to a feature vector
                             value = self.function_dict[feature](word)
diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index e3c9a9d017ef..b49d79fa1069 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -27,16 +27,16 @@
 class ConveRTFeaturizer(DenseFeaturizer):
     """Featurizer using ConveRT model.
 
-        Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
-        model from TFHub and computes sentence and sequence level feature representations
-        for dense featurizable attributes of each message object.
+    Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
+    model from TFHub and computes sentence and sequence level feature representations
+    for dense featurizable attributes of each message object.
     """
 
     @classmethod
     def required_components(cls) -> List[Type[Component]]:
         return [ConveRTTokenizer]
 
-    def _load_from_tfhub(self, model_url: Text):
+    def _load_from_tf_hub(self, model_url: Text):
         """Load model from TFHub"""
 
         import tensorflow_hub as tfhub
@@ -47,6 +47,7 @@ def _load_model(self) -> None:
         """Load model from cache if possible, otherwise from TFHub"""
 
         # needed in order to load model
+        # noinspection PyUnresolvedReferences
         import tensorflow_text
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
@@ -54,10 +55,10 @@ def _load_model(self) -> None:
         # required to take care of cases when other files are already
         # stored in the default TFHUB_CACHE_DIR
         try:
-            self._load_from_tfhub(model_url)
+            self._load_from_tf_hub(model_url)
         except OSError:
             os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
-            self._load_from_tfhub(model_url)
+            self._load_from_tf_hub(model_url)
 
         self.sentence_encoding_signature = self.module.signatures["default"]
         self.sequence_encoding_signature = self.module.signatures["encode_sequence"]
@@ -207,11 +208,11 @@ def train(
                 filter(lambda x: x.get(attribute), training_data.training_examples)
             )
 
-            pbar = tqdm(
+            progress_bar = tqdm(
                 range(0, len(non_empty_examples), batch_size),
                 desc=attribute.capitalize() + " batches",
             )
-            for batch_start_index in pbar:
+            for batch_start_index in progress_bar:
                 batch_end_index = min(
                     batch_start_index + batch_size, len(non_empty_examples)
                 )
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 5e32ecd84f47..5afaceec2fb0 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -20,8 +20,8 @@
 class LanguageModelFeaturizer(DenseFeaturizer):
     """Featurizer using transformer based language models.
 
-        Uses the output of HFTransformersNLP component to set the sequence and sentence
-        level representations for dense featurizable attributes of each message object.
+    Uses the output of HFTransformersNLP component to set the sequence and sentence
+    level representations for dense featurizable attributes of each message object.
     """
 
     @classmethod
@@ -39,26 +39,32 @@ def train(
             for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                 self._set_lm_features(example, attribute)
 
-    def get_doc(self, message: Message, attribute: Text) -> Any:
-
+    def _get_doc(self, message: Message, attribute: Text) -> Any:
+        """
+        Get the language model doc. A doc consists of
+        {'token_ids': ..., 'tokens': ...,
+        'sequence_features': ..., 'sentence_features': ...}
+        """
         return message.get(LANGUAGE_MODEL_DOCS[attribute])
 
     def process(self, message: Message, **kwargs: Any) -> None:
-
+        """Sets the dense features from the language model doc to the incoming
+        message."""
         self._set_lm_features(message)
 
     def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None:
         """Adds the precomputed word vectors to the messages features."""
+        doc = self._get_doc(message, attribute)
 
-        doc = self.get_doc(message, attribute)
+        if doc is None:
+            return
 
-        if doc is not None:
-            sequence_features = doc[SEQUENCE_FEATURES]
-            sentence_features = doc[SENTENCE_FEATURES]
+        sequence_features = doc[SEQUENCE_FEATURES]
+        sentence_features = doc[SENTENCE_FEATURES]
 
-            features = np.concatenate([sequence_features, sentence_features])
+        features = np.concatenate([sequence_features, sentence_features])
 
-            features = self._combine_with_existing_dense_features(
-                message, features, DENSE_FEATURE_NAMES[attribute]
-            )
-            message.set(DENSE_FEATURE_NAMES[attribute], features)
+        features = self._combine_with_existing_dense_features(
+            message, features, DENSE_FEATURE_NAMES[attribute]
+        )
+        message.set(DENSE_FEATURE_NAMES[attribute], features)
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 6f7956d96715..a47ca6545d76 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -214,8 +214,9 @@ def _add_padding_to_batch(
             actual_sequence_lengths.append(len(example_token_ids))
             max_seq_len = max(max_seq_len, len(example_token_ids))
         # Add padding according to max_seq_len
-        # Some models don't contain pad token, we use unknown token as padding token.This doesn't affect the computation
-        # since we compute an attention mask anyways.
+        # Some models don't contain pad token, we use unknown token as padding token.
+        # This doesn't affect the computation since we compute an attention mask
+        # anyways.
         for example_token_ids in batch_token_ids:
             padded_token_ids.append(
                 example_token_ids
@@ -297,7 +298,8 @@ def _get_docs_for_batch(
             batch_sequence_features,
         ) = self._get_model_features_for_batch(batch_token_ids)
 
-        # A doc consists of {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
+        # A doc consists of
+        # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
         batch_docs = []
         for index in range(len(batch_examples)):
             doc = {

From cf96b616314155850a231bb52fed1b4686a2b7ed Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 15:48:37 +0100
Subject: [PATCH 597/633] fix docstrings in components

---
 rasa/nlu/components.py | 99 ++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 47 deletions(-)

diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py
index b09e76604219..52dc535c546e 100644
--- a/rasa/nlu/components.py
+++ b/rasa/nlu/components.py
@@ -17,10 +17,10 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]:
     """Tries to import all package names and returns the packages where it failed.
 
     Args:
-        package_names: the package names to import
+        package_names: The package names to import.
 
     Returns:
-        package names that could not be imported
+        Package names that could not be imported.
     """
 
     import importlib
@@ -38,7 +38,7 @@ def validate_requirements(component_names: List[Text]) -> None:
     """Validates that all required importable python packages are installed.
 
     Args:
-        component_names: the list of component names
+        component_names: The list of component names.
     """
 
     from rasa.nlu import registry
@@ -66,7 +66,7 @@ def validate_empty_pipeline(pipeline: List["Component"]) -> None:
     """Ensures the pipeline is not empty.
 
     Args:
-        pipeline: the list of components in the pipeline
+        pipeline: the list of the :class:`rasa.nlu.components.Component`.
     """
 
     if len(pipeline) == 0:
@@ -79,11 +79,11 @@ def validate_empty_pipeline(pipeline: List["Component"]) -> None:
         )
 
 
-def validate_tokenizers(pipeline: List["Component"]) -> None:
+def validate_only_one_tokenizer_is_used(pipeline: List["Component"]) -> None:
     """Validates that only one tokenizer is present in the pipeline.
 
     Args:
-        pipeline: the list of the :class:`rasa.nlu.components.Component`
+        pipeline: the list of the :class:`rasa.nlu.components.Component`.
     """
 
     from rasa.nlu.tokenizers.tokenizer import Tokenizer
@@ -106,11 +106,11 @@ def _required_component_in_pipeline(
     """Checks that required component present in the pipeline.
 
     Args:
-        required_component: a class name of the required component
-        pipeline: the list of the :class:`rasa.nlu.components.Component`
+        required_component: A class name of the required component.
+        pipeline: The list of the :class:`rasa.nlu.components.Component`.
 
     Returns:
-        `True` if required_component is in the pipeline, `False` otherwise
+        `True` if required_component is in the pipeline, `False` otherwise.
     """
 
     for previous_component in pipeline:
@@ -123,7 +123,7 @@ def _check_deprecated_attributes(component: "Component") -> None:
     """Checks that the component doesn't have deprecated attributes.
 
     Args:
-        component: A class name of the component
+        component: The :class:`rasa.nlu.components.Component`.
     """
 
     if hasattr(component, "provides"):
@@ -149,7 +149,7 @@ def validate_required_components(pipeline: List["Component"]) -> None:
     """Validates that all required components are present in the pipeline.
 
     Args:
-        pipeline: the list of the :class:`rasa.nlu.components.Component`
+        pipeline: The list of the :class:`rasa.nlu.components.Component`.
     """
 
     for i, component in enumerate(pipeline):
@@ -171,11 +171,11 @@ def validate_pipeline(pipeline: List["Component"]) -> None:
     """Validates the pipeline.
 
     Args:
-        pipeline: the list of the :class:`rasa.nlu.components.Component`
+        pipeline: The list of the :class:`rasa.nlu.components.Component`.
     """
 
     validate_empty_pipeline(pipeline)
-    validate_tokenizers(pipeline)
+    validate_only_one_tokenizer_is_used(pipeline)
     validate_required_components(pipeline)
 
 
@@ -185,8 +185,8 @@ def validate_required_components_from_data(
     """Validates that all components are present in the pipeline based on data.
 
     Args:
-        pipeline: the list of the :class:`rasa.nlu.components.Component`
-        data: the :class:`rasa.nlu.training_data.training_data.TrainingData`
+        pipeline: The list of the :class:`rasa.nlu.components.Component`.
+        data: The :class:`rasa.nlu.training_data.training_data.TrainingData`.
     """
 
     from rasa.nlu.selectors.response_selector import ResponseSelector
@@ -285,7 +285,11 @@ def name(self):
     # Listed components should appear before the component itself in the pipeline.
     @classmethod
     def required_components(cls) -> List[Type["Component"]]:
-        """Specify which components need to be present in the pipeline."""
+        """Specify which components need to be present in the pipeline.
+
+        Returns:
+            The list of class names of required components.
+        """
 
         return []
 
@@ -327,7 +331,7 @@ def required_packages(cls) -> List[Text]:
         if a required package is not installed.
 
         Returns:
-            a list of required packages
+            The list of required package names.
         """
 
         return []
@@ -351,10 +355,10 @@ def load(
         calls to components previous to this one.
 
         Args:
-            meta: any configuration parameter related to the model
-            model_dir: the directory to load the component from
-            model_metadata: the model's :class:`rasa.nlu.model.Metadata`
-            cached_component: the cached component
+            meta: Any configuration parameter related to the model.
+            model_dir: The directory to load the component from.
+            model_metadata: The model's :class:`rasa.nlu.model.Metadata`.
+            cached_component: The cached component.
 
         Returns:
             the loaded component
@@ -374,11 +378,11 @@ def create(
         Method can access all configuration parameters.
 
         Args:
-            component_config: the components configuration parameters
-            config: the model configuration parameters
+            component_config: The components configuration parameters.
+            config: The model configuration parameters.
 
         Returns:
-            the created component
+            The created component.
         """
 
         # Check language supporting
@@ -403,7 +407,7 @@ def provide_context(self) -> Optional[Dict[Text, Any]]:
         (e.g. loading word vectors for the pipeline).
 
         Returns:
-            the updated component configuration
+            The updated component configuration.
         """
 
         pass
@@ -426,8 +430,9 @@ def train(
         of components previous to this one.
 
         Args:
-            training_data: the :class:`rasa.nlu.training_data.training_data.TrainingData`
-            config: the model configuration parameters
+            training_data:
+                The :class:`rasa.nlu.training_data.training_data.TrainingData`.
+            config: The model configuration parameters.
 
         """
 
@@ -446,7 +451,7 @@ def process(self, message: Message, **kwargs: Any) -> None:
         of components previous to this one.
 
         Args:
-            message: the :class:`rasa.nlu.training_data.message.Message` to process
+            message: The :class:`rasa.nlu.training_data.message.Message` to process.
 
         """
 
@@ -456,11 +461,11 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         """Persist this component to disk for future loading.
 
         Args:
-            file_name: the file name of the model
-            model_dir: the directory to store the model to
+            file_name: The file name of the model.
+            model_dir: The directory to store the model to.
 
         Returns:
-            an optional dictionary with any information about the stored model
+            An optional dictionary with any information about the stored model.
         """
 
         pass
@@ -477,11 +482,11 @@ def cache_key(
         metadata creates the same key.
 
         Args:
-            component_meta: the component configuration
-            model_metadata: the component's :class:`rasa.nlu.model.Metadata`
+            component_meta: The component configuration.
+            model_metadata: The component's :class:`rasa.nlu.model.Metadata`.
 
         Returns:
-            a unique caching key
+            A unique caching key.
         """
 
         return None
@@ -509,8 +514,8 @@ def prepare_partial_processing(
         be safely used to process messages).
 
         Args:
-            pipeline: the list of components
-            context: the context of processing
+            pipeline: The list of components.
+            context: The context of processing.
 
         """
 
@@ -525,10 +530,10 @@ def partially_process(self, message: Message) -> Message:
         previous to this one in the pipeline.
 
         Args:
-            message: the :class:`rasa.nlu.training_data.message.Message` to process
+            message: The :class:`rasa.nlu.training_data.message.Message` to process.
 
         Returns:
-            the processed :class:`rasa.nlu.training_data.message.Message`
+            The processed :class:`rasa.nlu.training_data.message.Message`.
 
         """
 
@@ -547,10 +552,10 @@ def can_handle_language(cls, language: Hashable) -> bool:
         determine which language is supported.)
 
         Args:
-            language: the language to check
+            language: The language to check.
 
         Returns:
-            `True` if component can handle specific language, `False` otherwise
+            `True` if component can handle specific language, `False` otherwise.
         """
 
         # if language_list is set to `None` it means: support all languages
@@ -618,14 +623,14 @@ def load_component(
 
         Args:
             component_meta:
-                the metadata of the component to load in the pipeline
+                The metadata of the component to load in the pipeline.
             model_dir:
-                the directory to read the model from
+                The directory to read the model from.
             model_metadata (Metadata):
-                the model's :class:`rasa.nlu.model.Metadata`
+                The model's :class:`rasa.nlu.model.Metadata`.
 
         Returns:
-            Component: the loaded component.
+            The loaded component.
         """
 
         from rasa.nlu import registry
@@ -657,11 +662,11 @@ def create_component(
         calls `create` to create a new component.
 
         Args:
-            component_config: the component configuration
-            cfg: the model configuration
+            component_config: The component configuration.
+            cfg: The model configuration.
 
         Returns:
-            the created component
+            The created component.
         """
 
         from rasa.nlu import registry

From 935b90df7dcd6ed4eba8308b6d92e7af85266750 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 15:52:13 +0100
Subject: [PATCH 598/633] review comments on lexical_syntactic_featuirzer.

---
 .../lexical_syntactic_featurizer.py           | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index d67fb7a98871..62b2a04d1390 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 
 import numpy as np
-import scipy.sparse
 from typing import Any, Dict, Optional, Text, List, Type
 
 from rasa.constants import DOCS_URL_COMPONENTS
@@ -19,8 +18,17 @@
 
 logger = logging.getLogger(__name__)
 
+END_OF_SENTENCE = "EOS"
+BEGIN_OF_SENTENCE = "BOS"
+
 
 class LexicalSyntacticFeaturizer(SparseFeaturizer):
+    """Creates features for entity extraction.
+
+    Moves with a sliding window over every token in the user message and creates
+    features according to the configuration.
+    """
+
     @classmethod
     def required_components(cls) -> List[Type[Component]]:
         return [Tokenizer]
@@ -102,8 +110,8 @@ def _create_feature_to_idx_dict(
         all_features = []
         for example in training_data.training_examples:
             # [:-1] to remove CLS token
-            tokens = example.get(TOKENS_NAMES[TEXT])[:-1]
-            all_features.append(self._tokens_to_features(tokens))
+            tokens_without_cls = example.get(TOKENS_NAMES[TEXT])[:-1]
+            all_features.append(self._tokens_to_features(tokens_without_cls))
 
         # build vocabulary of features
         feature_vocabulary = self._build_feature_vocabulary(all_features)
@@ -148,6 +156,7 @@ def _build_feature_vocabulary(
     def _create_sparse_features(self, message: Message) -> None:
         """Convert incoming messages into sparse features using the configured
         features."""
+        import scipy.sparse
 
         # [:-1] to remove CLS token
         tokens = message.get(TOKENS_NAMES[TEXT])[:-1]
@@ -194,7 +203,7 @@ def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]:
                 prefix = prefixes[current_feature_idx]
 
                 for feature in configured_features[current_feature_idx]:
-                    token_features[prefix + ":" + feature] = self._get_feature_value(
+                    token_features[f"{prefix}:{feature}"] = self._get_feature_value(
                         feature, token, token_idx, pointer_position, len(tokens)
                     )
 
@@ -213,14 +222,15 @@ def _features_to_one_hot(
             [len(sentence_features) + 1, self.number_of_features]
         )
 
-        for token_idx, toke_features in enumerate(sentence_features):
-            for feature_name, feature_value in toke_features.items():
+        for token_idx, token_features in enumerate(sentence_features):
+            for feature_name, feature_value in token_features.items():
+                feature_value_str = str(feature_value)
                 if (
                     feature_name in self.feature_to_idx_dict
-                    and str(feature_value) in self.feature_to_idx_dict[feature_name]
+                    and feature_value_str in self.feature_to_idx_dict[feature_name]
                 ):
                     feature_idx = self.feature_to_idx_dict[feature_name][
-                        str(feature_value)
+                        feature_value_str
                     ]
                     one_hot_feature_vector[token_idx][feature_idx] = 1
 
@@ -237,10 +247,10 @@ def _get_feature_value(
         pointer_position: int,
         token_length: int,
     ) -> Any:
-        if feature == "EOS":
+        if feature == END_OF_SENTENCE:
             return token_idx + pointer_position == token_length - 1
 
-        if feature == "BOS":
+        if feature == BEGIN_OF_SENTENCE:
             return token_idx + pointer_position == 0
 
         if feature not in self.function_dict:

From 4f657fb04283f98783a0b9479f8acdfb821bb2e3 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 16:00:45 +0100
Subject: [PATCH 599/633] review comments on convert

---
 .../dense_featurizer/convert_featurizer.py    | 31 +++----------------
 rasa/nlu/tokenizers/convert_tokenizer.py      | 24 +-------------
 rasa/utils/train_utils.py                     | 19 ++++++++++++
 3 files changed, 24 insertions(+), 50 deletions(-)

diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
index b49d79fa1069..1fa9de8d3210 100644
--- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py
@@ -17,8 +17,8 @@
 )
 import numpy as np
 import tensorflow as tf
-import os
 
+import rasa.utils.train_utils as train_utils
 import rasa.utils.common as common_utils
 
 logger = logging.getLogger(__name__)
@@ -36,39 +36,16 @@ class ConveRTFeaturizer(DenseFeaturizer):
     def required_components(cls) -> List[Type[Component]]:
         return [ConveRTTokenizer]
 
-    def _load_from_tf_hub(self, model_url: Text):
-        """Load model from TFHub"""
-
-        import tensorflow_hub as tfhub
-
-        self.module = tfhub.load(model_url)
-
-    def _load_model(self) -> None:
-        """Load model from cache if possible, otherwise from TFHub"""
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
 
-        # needed in order to load model
-        # noinspection PyUnresolvedReferences
-        import tensorflow_text
+        super(ConveRTFeaturizer, self).__init__(component_config)
 
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-
-        # required to take care of cases when other files are already
-        # stored in the default TFHUB_CACHE_DIR
-        try:
-            self._load_from_tf_hub(model_url)
-        except OSError:
-            os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
-            self._load_from_tf_hub(model_url)
+        self.module = train_utils.load_tf_hub_model(model_url)
 
         self.sentence_encoding_signature = self.module.signatures["default"]
         self.sequence_encoding_signature = self.module.signatures["encode_sequence"]
 
-    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
-
-        super(ConveRTFeaturizer, self).__init__(component_config)
-
-        self._load_model()
-
     @classmethod
     def required_packages(cls) -> List[Text]:
         return ["tensorflow_text", "tensorflow_hub"]
diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py
index d83255c9cbb6..5727a258641f 100644
--- a/rasa/nlu/tokenizers/convert_tokenizer.py
+++ b/rasa/nlu/tokenizers/convert_tokenizer.py
@@ -30,30 +30,8 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:
 
         super().__init__(component_config)
 
-        self._load_model()
-
-    def _load_from_tfhub(self, model_url: Text):
-
-        import tensorflow_hub as tfhub
-
-        self.module = tfhub.load(model_url)
-
-    def _load_model(self):
-
-        # needed to load the ConveRT model
-        import tensorflow_text
-
-        import os
-
         model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz"
-
-        # required to take care of cases when other files are already
-        # stored in the default TFHUB_CACHE_DIR
-        try:
-            self._load_from_tfhub(model_url)
-        except OSError:
-            os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
-            self._load_from_tfhub(model_url)
+        self.module = train_utils.load_tf_hub_model(model_url)
 
         self.tokenize_signature = self.module.signatures["tokenize"]
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 9fd9bbc668fe..989737bf1986 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -147,6 +147,25 @@ def update_evaluation_parameters(config: Dict[Text, Any]) -> Dict[Text, Any]:
     return config
 
 
+def load_tf_hub_model(model_url: Text) -> Any:
+    """Load model from cache if possible, otherwise from TFHub"""
+
+    import tensorflow_hub as tfhub
+
+    # needed to load the ConveRT model
+    # noinspection PyUnresolvedReferences
+    import tensorflow_text
+    import os
+
+    # required to take care of cases when other files are already
+    # stored in the default TFHUB_CACHE_DIR
+    try:
+        return tfhub.load(model_url)
+    except OSError:
+        os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
+        return tfhub.load(model_url)
+
+
 def _replace_deprecated_option(
     old_option: Text, new_option: Union[Text, List[Text]], config: Dict[Text, Any]
 ) -> Dict[Text, Any]:

From 208f5e466244a60021a563618b6e9339f1910832 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 16:13:27 +0100
Subject: [PATCH 600/633] review comments on hugging face components

---
 .../nlu/utils/hugging_face/hf_transformers.py | 25 ++++-------
 .../transformers_pre_post_processors.py       | 43 +++++++------------
 2 files changed, 24 insertions(+), 44 deletions(-)

diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index a47ca6545d76..3f33b9b8cf47 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -38,8 +38,7 @@ class HFTransformersNLP(Component):
         "model_weights": None,
     }
 
-    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
-
+    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
         super(HFTransformersNLP, self).__init__(component_config)
 
         self._load_model()
@@ -94,7 +93,6 @@ def required_packages(cls) -> List[Text]:
         return ["transformers"]
 
     def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
-
         split_token_ids = self.tokenizer.encode(text, add_special_tokens=False)
 
         split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids)
@@ -104,7 +102,6 @@ def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
     def _add_lm_specific_special_tokens(
         self, token_ids: List[List[int]]
     ) -> List[List[int]]:
-
         from rasa.nlu.utils.hugging_face.registry import (
             model_special_tokens_pre_processors,
         )
@@ -116,7 +113,6 @@ def _add_lm_specific_special_tokens(
         return augmented_tokens
 
     def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
-
         from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners
 
         return model_tokens_cleaners[self.model_name](token_strings)
@@ -157,17 +153,15 @@ def _tokenize_example(
         token_ids_out = []
 
         for token in tokens_in:
-            token_start, token_end, token_text = token.start, token.end, token.text
-
             # use lm specific tokenizer to further tokenize the text
-            split_token_ids, split_token_strings = self._lm_tokenize(token_text)
+            split_token_ids, split_token_strings = self._lm_tokenize(token.text)
 
             split_token_strings = self._lm_specific_token_cleanup(split_token_strings)
 
             token_ids_out += split_token_ids
 
             tokens_out += train_utils.align_tokens(
-                split_token_strings, token_end, token_start
+                split_token_strings, token.end, token.start
             )
 
         return tokens_out, token_ids_out
@@ -193,10 +187,10 @@ def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray:
 
         attention_mask = []
         max_seq_length = max(actual_sequence_lengths)
-        for index in range(len(actual_sequence_lengths)):
-            example_seq_length = actual_sequence_lengths[index]
+        for actual_sequence_length in actual_sequence_lengths:
             attention_mask.append(
-                [1] * example_seq_length + [0] * (max_seq_length - example_seq_length)
+                [1] * actual_sequence_length
+                + [0] * (max_seq_length - actual_sequence_length)
             )
 
         attention_mask = np.array(attention_mask).astype(np.float32)
@@ -210,9 +204,11 @@ def _add_padding_to_batch(
         # Compute max length across examples
         max_seq_len = 0
         actual_sequence_lengths = []
+
         for example_token_ids in batch_token_ids:
             actual_sequence_lengths.append(len(example_token_ids))
             max_seq_len = max(max_seq_len, len(example_token_ids))
+
         # Add padding according to max_seq_len
         # Some models don't contain pad token, we use unknown token as padding token.
         # This doesn't affect the computation since we compute an attention mask
@@ -227,8 +223,7 @@ def _add_padding_to_batch(
     @staticmethod
     def _extract_nonpadded_embeddings(
         embeddings: np.ndarray, actual_sequence_lengths: List[int]
-    ):
-
+    ) -> np.ndarray:
         nonpadded_sequence_embeddings = []
         for index, embedding in enumerate(embeddings):
             unmasked_embedding = embedding[: actual_sequence_lengths[index]]
@@ -239,7 +234,6 @@ def _extract_nonpadded_embeddings(
     def _compute_batch_sequence_features(
         self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]]
     ) -> np.ndarray:
-
         model_outputs = self.model(
             np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
         )
@@ -253,7 +247,6 @@ def _compute_batch_sequence_features(
     def _get_model_features_for_batch(
         self, batch_token_ids: List[List[int]]
     ) -> Tuple[np.ndarray, np.ndarray]:
-
         # Let's first add tokenizer specific special tokens to all examples
         batch_token_ids_augmented = self._add_lm_specific_special_tokens(
             batch_token_ids
diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
index 9273bd5a2d7a..27f02feedbde 100644
--- a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
+++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py
@@ -4,7 +4,6 @@
 
 def bert_tokens_pre_processor(token_ids: List[int]) -> List[int]:
     """Add BERT style special tokens(CLS and SEP)"""
-
     BERT_CLS_ID = 101
     BERT_SEP_ID = 102
 
@@ -17,13 +16,11 @@ def bert_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 
 
 def gpt_tokens_pre_processor(token_ids: List[int]) -> List[int]:
-
     return token_ids
 
 
 def xlnet_tokens_pre_processor(token_ids: List[int]) -> List[int]:
     """Add XLNET style special tokens"""
-
     XLNET_CLS_ID = 3
     XLNET_SEP_ID = 4
 
@@ -35,7 +32,6 @@ def xlnet_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 
 def roberta_tokens_pre_processor(token_ids: List[int]) -> List[int]:
     """Add RoBERTa style special tokens"""
-
     ROBERTA_BEG_ID = 0
     ROBERTA_END_ID = 2
 
@@ -47,7 +43,6 @@ def roberta_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 
 def xlm_tokens_pre_processor(token_ids: List[int]) -> List[int]:
     """Add RoBERTa style special tokens"""
-
     XLM_SEP_ID = 1
 
     token_ids.insert(0, XLM_SEP_ID)
@@ -59,11 +54,10 @@ def xlm_tokens_pre_processor(token_ids: List[int]) -> List[int]:
 def bert_embeddings_post_processor(
     sequence_embeddings: np.ndarray,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Post process embeddings from BERT by removing CLS and SEP embeddings and
-    returning CLS
-
-    token embedding as sentence representation"""
+    """Post process embeddings from BERT
 
+    by removing CLS and SEP embeddings and returning CLS token embedding as
+    sentence representation"""
     sentence_embedding = sequence_embeddings[0]
     post_processed_embedding = sequence_embeddings[1:-1]
 
@@ -73,11 +67,10 @@ def bert_embeddings_post_processor(
 def gpt_embeddings_post_processor(
     sequence_embeddings: np.ndarray,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Post process embeddings from GPT models by taking a mean over sequence
-    embeddings and
-
-    returning that as sentence representation"""
+    """Post process embeddings from GPT models
 
+    by taking a mean over sequence embeddings and returning that as sentence
+    representation"""
     sentence_embedding = np.mean(sequence_embeddings, axis=0)
     post_processed_embedding = sequence_embeddings
 
@@ -87,12 +80,11 @@ def gpt_embeddings_post_processor(
 def xlnet_embeddings_post_processor(
     sequence_embeddings: np.ndarray,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Post process embeddings from XLNet models by taking a mean over sequence
-    embeddings and
+    """Post process embeddings from XLNet models
 
-    returning that as sentence representation. Remove last two time steps corresponding
+    by taking a mean over sequence embeddings and returning that as sentence
+    representation. Remove last two time steps corresponding
     to special tokens from the sequence embeddings."""
-
     post_processed_embedding = sequence_embeddings[:-2]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
@@ -102,10 +94,10 @@ def xlnet_embeddings_post_processor(
 def roberta_embeddings_post_processor(
     sequence_embeddings: np.ndarray,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Post process embeddings from Roberta models by taking a mean over sequence
-    embeddings and
+    """Post process embeddings from Roberta models
 
-    returning that as sentence representation. Remove first and last time steps
+    by taking a mean over sequence embeddings and returning that as sentence
+    representation. Remove first and last time steps
     corresponding to special tokens from the sequence embeddings."""
 
     post_processed_embedding = sequence_embeddings[1:-1]
@@ -117,12 +109,11 @@ def roberta_embeddings_post_processor(
 def xlm_embeddings_post_processor(
     sequence_embeddings: np.ndarray,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Post process embeddings from XLM models by taking a mean over sequence
-    embeddings and
+    """Post process embeddings from XLM models
 
-    returning that as sentence representation. Remove first and last time steps
+    by taking a mean over sequence embeddings and returning that as sentence
+    representation. Remove first and last time steps
     corresponding to special tokens from the sequence embeddings."""
-
     post_processed_embedding = sequence_embeddings[1:-1]
     sentence_embedding = np.mean(post_processed_embedding, axis=0)
 
@@ -132,7 +123,6 @@ def xlm_embeddings_post_processor(
 def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
     """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token
     into sub-tokens"""
-
     tokens = [string.replace("##", "") for string in token_strings]
     return [string for string in tokens if string]
 
@@ -140,7 +130,6 @@ def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
 def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
     """Clean up tokens with the extra delimiters(</w>) OpenAIGPT adds while breaking a
     token into sub-tokens"""
-
     tokens = [string.replace("</w>", "") for string in token_strings]
     return [string for string in tokens if string]
 
@@ -148,7 +137,6 @@ def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
 def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
     """Clean up tokens with the extra delimiters(</w>) GPT2 adds while breaking a token
     into sub-tokens"""
-
     tokens = [string.replace("Ġ", "") for string in token_strings]
     return [string for string in tokens if string]
 
@@ -156,6 +144,5 @@ def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
 def xlnet_tokens_cleaner(token_strings: List[Text]) -> List[Text]:
     """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token
     into sub-tokens"""
-
     tokens = [string.replace("▁", "") for string in token_strings]
     return [string for string in tokens if string]

From c5b337d2e25d01d62bacf6d46fd05c6679d08ac8 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:17:36 +0100
Subject: [PATCH 601/633] rename inverted tag and label dicts

---
 rasa/nlu/classifiers/diet_classifier.py       | 92 +++++++++++--------
 .../embedding_intent_classifier.py            |  8 +-
 rasa/nlu/selectors/response_selector.py       | 20 ++--
 3 files changed, 68 insertions(+), 52 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 579ed97d12ee..c2c44ac8f407 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -258,8 +258,8 @@ def required_packages(cls) -> List[Text]:
     def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        index_label_id_mapping: Optional[Dict[int, Text]] = None,
+        index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
@@ -276,8 +276,8 @@ def __init__(
         self._check_config_parameters()
 
         # transform numbers to labels
-        self.inverted_label_dict = inverted_label_dict
-        self.inverted_tag_dict = inverted_tag_dict
+        self.index_label_id_mapping = index_label_id_mapping
+        self.index_tag_id_mapping = index_tag_id_mapping
 
         self.model = model
 
@@ -302,7 +302,7 @@ def model_class() -> Type[RasaModel]:
 
     # training data helpers:
     @staticmethod
-    def _create_label_id_dict(
+    def _label_id_index_mapping(
         training_data: TrainingData, attribute: Text
     ) -> Dict[Text, int]:
         """Create label_id dictionary."""
@@ -314,7 +314,11 @@ def _create_label_id_dict(
             label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids))
         }
 
-    def _create_tag_id_dict(self, training_data: TrainingData) -> Dict[Text, int]:
+    @staticmethod
+    def _invert_mapping(mapping: Dict) -> Dict:
+        return {value: key for key, value in mapping.items()}
+
+    def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
         """Create tag_id dictionary"""
 
         if self.component_config[BILOU_FLAG]:
@@ -574,15 +578,17 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         if self.component_config[BILOU_FLAG]:
             bilou_utils.apply_bilou_schema(training_data)
 
-        label_id_dict = self._create_label_id_dict(training_data, attribute=INTENT)
-        self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
+        label_id_index_mapping = self._label_id_index_mapping(
+            training_data, attribute=INTENT
+        )
+        self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
 
         self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=INTENT
+            training_data, label_id_index_mapping, attribute=INTENT
         )
 
-        tag_id_dict = self._create_tag_id_dict(training_data)
-        self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()}
+        tag_id_index_mapping = self._tag_id_index_mapping(training_data)
+        self.index_tag_id_mapping = self._invert_mapping(tag_id_index_mapping)
 
         label_attribute = (
             INTENT if self.component_config[INTENT_CLASSIFICATION] else None
@@ -590,12 +596,12 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
 
         model_data = self._create_model_data(
             training_data.training_examples,
-            label_id_dict,
-            tag_id_dict,
+            label_id_index_mapping,
+            tag_id_index_mapping,
             label_attribute=label_attribute,
         )
 
-        self.num_tags = len(self.inverted_tag_dict)
+        self.num_tags = len(self.index_tag_id_mapping)
 
         self._check_input_dimension_consistency(model_data)
 
@@ -639,7 +645,7 @@ def train(
         self.model = self.model_class()(
             data_signature=model_data.get_signature(),
             label_data=self._label_data,
-            inverted_tag_dict=self.inverted_tag_dict,
+            index_tag_id_mapping=self.index_tag_id_mapping,
             config=self.component_config,
         )
 
@@ -697,7 +703,7 @@ def _predict_label(
         # if X contains all zeros do not predict some label
         if label_ids.size > 0:
             label = {
-                "name": self.inverted_label_dict[label_ids[0]],
+                "name": self.index_label_id_mapping[label_ids[0]],
                 "confidence": message_sim[0],
             }
 
@@ -712,7 +718,7 @@ def _predict_label(
             ranking = list(zip(list(label_ids), message_sim))
             ranking = ranking[:output_length]
             label_ranking = [
-                {"name": self.inverted_label_dict[label_idx], "confidence": score}
+                {"name": self.index_label_id_mapping[label_idx], "confidence": score}
                 for label_idx, score in ranking
             ]
 
@@ -727,7 +733,7 @@ def _predict_entities(
         # load tf graph and session
         predictions = predict_out["e_ids"].numpy()
 
-        tags = [self.inverted_tag_dict[p] for p in predictions[0]]
+        tags = [self.index_tag_id_mapping[p] for p in predictions[0]]
 
         if self.component_config[BILOU_FLAG]:
             tags = bilou_utils.remove_bilou_prefixes(tags)
@@ -812,10 +818,12 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             model_dir / f"{file_name}.label_data.pkl", self._label_data
         )
         io_utils.json_pickle(
-            model_dir / f"{file_name}.inverted_label_dict.pkl", self.inverted_label_dict
+            model_dir / f"{file_name}.index_label_id_mapping.pkl",
+            self.index_label_id_mapping,
         )
         io_utils.json_pickle(
-            model_dir / f"{file_name}.inverted_tag_dict.pkl", self.inverted_tag_dict
+            model_dir / f"{file_name}.index_tag_id_mapping.pkl",
+            self.index_tag_id_mapping,
         )
         io_utils.json_pickle(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl", self.batch_tuple_sizes
@@ -843,8 +851,8 @@ def load(
 
         (
             batch_tuple_sizes,
-            inv_label_dict,
-            inv_tag_dict,
+            index_label_id_mapping,
+            index_tag_id_mapping,
             label_data,
             meta,
             data_example,
@@ -852,12 +860,14 @@ def load(
 
         meta = train_utils.update_similarity_type(meta)
 
-        model = cls._load_model(inv_tag_dict, label_data, meta, data_example, model_dir)
+        model = cls._load_model(
+            index_tag_id_mapping, label_data, meta, data_example, model_dir
+        )
 
         return cls(
             component_config=meta,
-            inverted_label_dict=inv_label_dict,
-            inverted_tag_dict=inv_tag_dict,
+            index_label_id_mapping=index_label_id_mapping,
+            index_tag_id_mapping=index_tag_id_mapping,
             model=model,
             batch_tuple_sizes=batch_tuple_sizes,
         )
@@ -870,29 +880,29 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
 
         data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl")
         label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl")
-        inverted_label_dict = io_utils.json_unpickle(
-            model_dir / f"{file_name}.inverted_label_dict.pkl"
+        index_label_id_mapping = io_utils.json_unpickle(
+            model_dir / f"{file_name}.index_label_id_mapping.pkl"
         )
-        inverted_tag_dict = io_utils.json_unpickle(
-            model_dir / f"{file_name}.inverted_tag_dict.pkl"
+        index_tag_id_mapping = io_utils.json_unpickle(
+            model_dir / f"{file_name}.index_tag_id_mapping.pkl"
         )
         batch_tuple_sizes = io_utils.json_unpickle(
             model_dir / f"{file_name}.batch_tuple_sizes.pkl"
         )
 
         # jsonpickle converts dictionary keys to strings
-        inverted_label_dict = {
-            int(key): value for key, value in inverted_label_dict.items()
+        index_label_id_mapping = {
+            int(key): value for key, value in index_label_id_mapping.items()
         }
-        if inverted_tag_dict is not None:
-            inverted_tag_dict = {
-                int(key): value for key, value in inverted_tag_dict.items()
+        if index_tag_id_mapping is not None:
+            index_tag_id_mapping = {
+                int(key): value for key, value in index_tag_id_mapping.items()
             }
 
         return (
             batch_tuple_sizes,
-            inverted_label_dict,
-            inverted_tag_dict,
+            index_label_id_mapping,
+            index_tag_id_mapping,
             label_data,
             meta,
             data_example,
@@ -901,7 +911,7 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
     @classmethod
     def _load_model(
         cls,
-        inv_tag_dict: Dict[int, Text],
+        index_tag_id_mapping: Dict[int, Text],
         label_data: RasaModelData,
         meta: Dict[Text, Any],
         data_example: Dict[Text, List[np.ndarray]],
@@ -918,7 +928,7 @@ def _load_model(
             model_data_example,
             data_signature=model_data_example.get_signature(),
             label_data=label_data,
-            inverted_tag_dict=inv_tag_dict,
+            index_tag_id_mapping=index_tag_id_mapping,
             config=meta,
         )
 
@@ -946,7 +956,7 @@ def __init__(
         self,
         data_signature: Dict[Text, List[FeatureSignature]],
         label_data: RasaModelData,
-        inverted_tag_dict: Optional[Dict[int, Text]],
+        index_tag_id_mapping: Optional[Dict[int, Text]],
         config: Dict[Text, Any],
     ) -> None:
         super().__init__(name="DIET", random_seed=config[RANDOM_SEED])
@@ -966,7 +976,9 @@ def __init__(
         self.tf_label_data = self.batch_to_model_data_format(
             label_batch, label_data.get_signature()
         )
-        self._num_tags = len(inverted_tag_dict) if inverted_tag_dict is not None else 0
+        self._num_tags = (
+            len(index_tag_id_mapping) if index_tag_id_mapping is not None else 0
+        )
 
         # tf objects
         self._tf_layers = {}
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 8135be00cef7..ed17bf4b8081 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -137,8 +137,8 @@ def required_components(cls) -> List[Type[Component]]:
     def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        index_label_id_mapping: Optional[Dict[int, Text]] = None,
+        index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
@@ -154,8 +154,8 @@ def __init__(
 
         super().__init__(
             component_config,
-            inverted_label_dict,
-            inverted_tag_dict,
+            index_label_id_mapping,
+            index_tag_id_mapping,
             model,
             batch_tuple_sizes,
         )
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 91f279176a0a..b1c28399ccac 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -190,8 +190,8 @@ def required_components(cls) -> List[Type[Component]]:
     def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
-        inverted_label_dict: Optional[Dict[int, Text]] = None,
-        inverted_tag_dict: Optional[Dict[int, Text]] = None,
+        index_label_id_mapping: Optional[Dict[int, Text]] = None,
+        index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
         batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
@@ -205,8 +205,8 @@ def __init__(
 
         super().__init__(
             component_config,
-            inverted_label_dict,
-            inverted_tag_dict,
+            index_label_id_mapping,
+            index_tag_id_mapping,
             model,
             batch_tuple_sizes,
         )
@@ -255,15 +255,19 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         if self.retrieval_intent:
             training_data = training_data.filter_by_intent(self.retrieval_intent)
 
-        label_id_dict = self._create_label_id_dict(training_data, attribute=RESPONSE)
-        self.inverted_label_dict = {v: k for k, v in label_id_dict.items()}
+        label_id_index_mapping = self._label_id_index_mapping(
+            training_data, attribute=RESPONSE
+        )
+        self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
 
         self._label_data = self._create_label_data(
-            training_data, label_id_dict, attribute=RESPONSE
+            training_data, label_id_index_mapping, attribute=RESPONSE
         )
 
         model_data = self._create_model_data(
-            training_data.intent_examples, label_id_dict, label_attribute=RESPONSE
+            training_data.intent_examples,
+            label_id_index_mapping,
+            label_attribute=RESPONSE,
         )
 
         self._check_input_dimension_consistency(model_data)

From ea84dc94006fb1384a581bc68ffdc140b585dc36 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:20:47 +0100
Subject: [PATCH 602/633] remove _find_example_for_tag

---
 rasa/nlu/classifiers/diet_classifier.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c2c44ac8f407..6861d236fd8e 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -348,16 +348,6 @@ def _find_example_for_label(
                 return ex
         return None
 
-    @staticmethod
-    def _find_example_for_tag(
-        tag: Text, examples: List[Message], attribute: Text
-    ) -> Optional[Message]:
-        for ex in examples:
-            for e in ex.get(attribute):
-                if e["entity"] == tag:
-                    return ex
-        return None
-
     @staticmethod
     def _check_labels_features_exist(
         labels_example: List[Message], attribute: Text

From dc2d5a9a09e9359f4106ca3e207173462cbd8a92 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:27:22 +0100
Subject: [PATCH 603/633] remove setting numpy random seed in train

---
 rasa/nlu/classifiers/diet_classifier.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 6861d236fd8e..8fe8a91e17dc 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -609,9 +609,6 @@ def train(
     ) -> None:
         """Train the embedding intent classifier on a data set."""
 
-        # set numpy random seed
-        np.random.seed(self.component_config[RANDOM_SEED])
-
         model_data = self.preprocess_train_data(training_data)
         if model_data.is_empty():
             logger.debug(

From a306da7c744f507c912d4a793730ace6bf5b6991 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 16:30:13 +0100
Subject: [PATCH 604/633] review comments

---
 rasa/nlu/classifiers/diet_classifier.py                    | 6 +++---
 rasa/nlu/extractors/crf_entity_extractor.py                | 3 ++-
 .../sparse_featurizer/lexical_syntactic_featurizer.py      | 7 +++++--
 rasa/nlu/selectors/response_selector.py                    | 6 +++---
 rasa/nlu/tokenizers/spacy_tokenizer.py                     | 7 ++++++-
 rasa/utils/tensorflow/models.py                            | 6 +++---
 6 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 8fe8a91e17dc..f26b9e7a752b 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -31,7 +31,7 @@
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
 )
-from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.config import RasaNLUModelConfig, InvalidConfigError
 from rasa.nlu.training_data import TrainingData
 from rasa.nlu.model import Metadata
 from rasa.nlu.training_data import Message
@@ -980,13 +980,13 @@ def __init__(
 
     def _check_data(self) -> None:
         if TEXT_FEATURES not in self.data_signature:
-            raise ValueError(
+            raise InvalidConfigError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
         if self.config[INTENT_CLASSIFICATION]:
             if LABEL_FEATURES not in self.data_signature:
-                raise ValueError(
+                raise InvalidConfigError(
                     f"No label features specified. "
                     f"Cannot train '{self.__class__.__name__}' model."
                 )
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 6caccf8ddd29..31495ecff5b2 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -6,6 +6,7 @@
 
 import rasa.nlu.utils.bilou_utils as bilou_utils
 import rasa.utils.common as common_utils
+from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY
 from rasa.nlu.config import RasaNLUModelConfig
 from rasa.nlu.tokenizers.tokenizer import Tokenizer
 from rasa.nlu.components import Component
@@ -520,7 +521,7 @@ def _from_text_to_crf(
         for i, token in enumerate(tokens):
             pattern = self.__pattern_of_token(message, i)
             entity = entities[i] if entities else "N/A"
-            tag = token.get("pos")
+            tag = token.get(POS_TAG_KEY)
             dense_features = (
                 text_dense_features[i] if text_dense_features is not None else []
             )
diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 62b2a04d1390..9458345d0598 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -5,6 +5,7 @@
 import numpy as np
 from typing import Any, Dict, Optional, Text, List, Type
 
+from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY
 from rasa.constants import DOCS_URL_COMPONENTS
 from rasa.nlu.components import Component
 from rasa.nlu.tokenizers.tokenizer import Token
@@ -55,8 +56,10 @@ def required_components(cls) -> List[Type[Component]]:
         "suffix3": lambda token: token.text[-3:],
         "suffix2": lambda token: token.text[-2:],
         "suffix1": lambda token: token.text[-1:],
-        "pos": lambda token: token.data.get("pos") if "pos" in token.data else None,
-        "pos2": lambda token: token.data.get("pos")[:2]
+        "pos": lambda token: token.data.get(POS_TAG_KEY)
+        if POS_TAG_KEY in token.data
+        else None,
+        "pos2": lambda token: token.data.get(POS_TAG_KEY)[:2]
         if "pos" in token.data
         else None,
         "upper": lambda token: token.text.isupper(),
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index b1c28399ccac..ab302426308f 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -5,6 +5,7 @@
 
 from typing import Any, Dict, Optional, Text, Tuple, Union, List, Type
 
+from rasa.nlu.config import InvalidConfigError
 from rasa.nlu.training_data import TrainingData, Message
 from rasa.nlu.components import Component
 from rasa.nlu.featurizers.featurizer import Featurizer
@@ -237,7 +238,6 @@ def _check_config_parameters(self) -> None:
     def _set_message_property(
         message: Message, prediction_dict: Dict[Text, Any], selector_key: Text
     ) -> None:
-
         message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {})
         message_selector_properties[selector_key] = prediction_dict
         message.set(
@@ -298,12 +298,12 @@ def process(self, message: Message, **kwargs: Any) -> None:
 class DIET2DIET(DIET):
     def _check_data(self) -> None:
         if TEXT_FEATURES not in self.data_signature:
-            raise ValueError(
+            raise InvalidConfigError(
                 f"No text features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
         if LABEL_FEATURES not in self.data_signature:
-            raise ValueError(
+            raise InvalidConfigError(
                 f"No label features specified. "
                 f"Cannot train '{self.__class__.__name__}' model."
             )
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 76aa75be00c4..5e05aa88e219 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -17,6 +17,9 @@
     spacy = None
 
 
+POS_TAG_KEY = "pos"
+
+
 class SpacyTokenizer(Tokenizer):
     @classmethod
     def required_components(cls) -> List[Type[Component]]:
@@ -36,7 +39,9 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         doc = self.get_doc(message, attribute)
 
         return [
-            Token(t.text, t.idx, lemma=t.lemma_, data={"pos": self._tag_of_token(t)})
+            Token(
+                t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
+            )
             for t in doc
         ]
 
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index d723278d288a..27680a9a2d4d 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -80,9 +80,9 @@ def fit(
         ) = self._get_tf_evaluation_functions(eager, evaluation_model_data)
 
         val_results = {}  # validation is not performed every epoch
-        pbar = tqdm(range(epochs), desc="Epochs", disable=disable)
+        progress_bar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
-        for ep in pbar:
+        for ep in progress_bar:
             ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
 
             self._batch_loop(
@@ -103,7 +103,7 @@ def fit(
 
                 postfix_dict.update(val_results)
 
-            pbar.set_postfix(postfix_dict)
+            progress_bar.set_postfix(postfix_dict)
 
         self._training = None  # training phase should be defined when building a graph
         if not disable:

From be464958070215a8a8feafcf0d2310048f1d2d77 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:34:39 +0100
Subject: [PATCH 605/633] create no entity tag constant

---
 rasa/nlu/classifiers/diet_classifier.py     |  9 +++++----
 rasa/nlu/constants.py                       |  1 +
 rasa/nlu/extractors/crf_entity_extractor.py | 10 ++++++++--
 rasa/nlu/test.py                            | 17 ++++++++++------
 rasa/nlu/utils/bilou_utils.py               | 22 ++++++++++++++-------
 5 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 8fe8a91e17dc..29f0c1b74078 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -27,6 +27,7 @@
     INTENT,
     TEXT,
     ENTITIES,
+    NO_ENTITY_TAG,
     SPARSE_FEATURE_NAMES,
     DENSE_FEATURE_NAMES,
     TOKENS_NAMES,
@@ -333,9 +334,9 @@ def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
         tag_id_dict = {
             tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
         }
-        # "O" corresponds to non-entity which should correspond to 0 index
+        # NO_ENTITY_TAG corresponds to non-entity which should correspond to 0 index
         # needed for correct prediction for padding
-        tag_id_dict["O"] = 0
+        tag_id_dict[NO_ENTITY_TAG] = 0
 
         return tag_id_dict
 
@@ -739,9 +740,9 @@ def _convert_tags_to_entities(
         text: Text, tokens: List[Token], tags: List[Text]
     ) -> List[Dict[Text, Any]]:
         entities = []
-        last_tag = "O"
+        last_tag = NO_ENTITY_TAG
         for token, tag in zip(tokens, tags):
-            if tag == "O":
+            if tag == NO_ENTITY_TAG:
                 last_tag = tag
                 continue
 
diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py
index 16f1b261b242..bc2334aa7fc1 100644
--- a/rasa/nlu/constants.py
+++ b/rasa/nlu/constants.py
@@ -8,6 +8,7 @@
 
 ENTITIES = "entities"
 BILOU_ENTITIES = "bilou_entities"
+NO_ENTITY_TAG = "O"
 
 EXTRACTOR = "extractor"
 
diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py
index 6caccf8ddd29..7e5b708dc974 100644
--- a/rasa/nlu/extractors/crf_entity_extractor.py
+++ b/rasa/nlu/extractors/crf_entity_extractor.py
@@ -13,7 +13,13 @@
 from rasa.nlu.model import Metadata
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message, TrainingData
-from rasa.nlu.constants import TOKENS_NAMES, TEXT, DENSE_FEATURE_NAMES, ENTITIES
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    TEXT,
+    DENSE_FEATURE_NAMES,
+    ENTITIES,
+    NO_ENTITY_TAG,
+)
 from rasa.constants import (
     DOCS_URL_TRAINING_DATA_NLU,
     DOCS_URL_COMPONENTS,
@@ -327,7 +333,7 @@ def _convert_simple_tagging_to_entity_result(
         for word_idx in range(len(tokens)):
             entity_label, confidence = self.most_likely_entity(word_idx, entities)
             word = tokens[word_idx]
-            if entity_label != "O":
+            if entity_label != NO_ENTITY_TAG:
                 ent = {
                     "start": word.start,
                     "end": word.end,
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index bb63534a3cec..f0331769108c 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -27,6 +27,7 @@
     OPEN_UTTERANCE_PREDICTION_KEY,
     EXTRACTOR,
     PRETRAINED_EXTRACTORS,
+    NO_ENTITY_TAG,
 )
 from rasa.model import get_model
 from rasa.nlu import config, training_data, utils
@@ -682,13 +683,15 @@ def evaluate_entities(
 
     aligned_predictions = align_all_entity_predictions(entity_results, extractors)
     merged_targets = merge_labels(aligned_predictions)
-    merged_targets = substitute_labels(merged_targets, "O", NO_ENTITY)
+    merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY)
 
     result = {}
 
     for extractor in extractors:
         merged_predictions = merge_labels(aligned_predictions, extractor)
-        merged_predictions = substitute_labels(merged_predictions, "O", NO_ENTITY)
+        merged_predictions = substitute_labels(
+            merged_predictions, NO_ENTITY_TAG, NO_ENTITY
+        )
         logger.info(f"Evaluation for entity extractor: {extractor} ")
         if output_directory:
             report_filename = f"{extractor}_report.json"
@@ -815,7 +818,7 @@ def pick_best_entity_fit(token: Token, candidates: List[Dict]) -> Text:
     """
 
     if len(candidates) == 0:
-        return "O"
+        return NO_ENTITY_TAG
     elif len(candidates) == 1:
         return candidates[0]["entity"]
     else:
@@ -836,7 +839,7 @@ def determine_token_labels(
     """
 
     if entities is None or len(entities) == 0:
-        return "O"
+        return NO_ENTITY_TAG
     if not do_extractors_support_overlap(extractors) and do_entities_overlap(entities):
         raise ValueError("The possible entities should not overlap")
 
@@ -1527,11 +1530,13 @@ def _compute_entity_metrics(
     aligned_predictions = align_all_entity_predictions(entity_results, extractors)
 
     merged_targets = merge_labels(aligned_predictions)
-    merged_targets = substitute_labels(merged_targets, "O", NO_ENTITY)
+    merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY)
 
     for extractor in extractors:
         merged_predictions = merge_labels(aligned_predictions, extractor)
-        merged_predictions = substitute_labels(merged_predictions, "O", NO_ENTITY)
+        merged_predictions = substitute_labels(
+            merged_predictions, NO_ENTITY_TAG, NO_ENTITY
+        )
         _, precision, f1, accuracy = get_evaluation_metrics(
             merged_targets, merged_predictions, exclude_label=NO_ENTITY
         )
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 8a085745cba0..712127ced134 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -3,7 +3,13 @@
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message
 from rasa.nlu.training_data import TrainingData
-from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT, BILOU_ENTITIES
+from rasa.nlu.constants import (
+    ENTITIES,
+    TOKENS_NAMES,
+    TEXT,
+    BILOU_ENTITIES,
+    NO_ENTITY_TAG,
+)
 
 BILOU_PREFIXES = ["B-", "I-", "U-", "L-"]
 
@@ -29,11 +35,11 @@ def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
 
     if message.get(BILOU_ENTITIES):
         _tags = [
-            tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict["O"]
+            tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict[NO_ENTITY_TAG]
             for _tag in message.get(BILOU_ENTITIES)
         ]
     else:
-        _tags = [tag_id_dict["O"] for _ in message.get(TOKENS_NAMES[TEXT])]
+        _tags = [tag_id_dict[NO_ENTITY_TAG] for _ in message.get(TOKENS_NAMES[TEXT])]
 
     return _tags
 
@@ -54,16 +60,16 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
             if example.get(BILOU_ENTITIES)
             for e in example.get(BILOU_ENTITIES)
         ]
-    ) - {"O"}
+    ) - {NO_ENTITY_TAG}
 
     tag_id_dict = {
         f"{prefix}{tag}": idx_1 * len(BILOU_PREFIXES) + idx_2 + 1
         for idx_1, tag in enumerate(sorted(distinct_tags))
         for idx_2, prefix in enumerate(BILOU_PREFIXES)
     }
-    # "O" corresponds to non-entity which should correspond to 0 index
+    # NO_ENTITY_TAG corresponds to non-entity which should correspond to 0 index
     # needed for correct prediction for padding
-    tag_id_dict["O"] = 0
+    tag_id_dict[NO_ENTITY_TAG] = 0
 
     return tag_id_dict
 
@@ -93,7 +99,9 @@ def convert_entity(entity):
 
 
 def bilou_tags_from_offsets(
-    tokens: List[Token], entities: List[Tuple[int, int, Text]], missing: Text = "O"
+    tokens: List[Token],
+    entities: List[Tuple[int, int, Text]],
+    missing: Text = NO_ENTITY_TAG,
 ) -> List[Text]:
     """Creates a list of BILOU tags for the given list of tokens and entities."""
 

From bb2b6cbfceec949d5006fb4a831bb378772f3972 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:41:55 +0100
Subject: [PATCH 606/633] add type to tf_layers

---
 rasa/core/policies/ted_policy.py        | 2 +-
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 4be295083ce3..26eb88d2e8e1 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -484,7 +484,7 @@ def __init__(
         self.metrics_to_log += ["loss", "acc"]
 
         # set up tf layers
-        self._tf_layers = {}
+        self._tf_layers: Dict[Text : tf.keras.layers.Layer] = {}
         self._prepare_layers()
 
     def _check_data(self) -> None:
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index fa6b237dac99..b5ba6fd0d0e9 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -969,7 +969,7 @@ def __init__(
         )
 
         # tf objects
-        self._tf_layers = {}
+        self._tf_layers: Dict[Text : tf.keras.layers.Layer] = {}
         self._prepare_layers()
 
         # tf training

From d1aa219b79cc486d8b40d6f260bb8e5088904e93 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 16:49:54 +0100
Subject: [PATCH 607/633] update constants comment

---
 rasa/utils/tensorflow/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 31a41dc1009a..97016c54844f 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -1,4 +1,4 @@
-# constants - configuration parameters
+# constants for configuration parameters of our tensorflow models
 
 LABEL = "label"
 HIDDEN_LAYERS_SIZES = "hidden_layers_sizes"

From 12bdf879efb9d6821eec43a24d74ab7ab8b14adb Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 18:53:07 +0100
Subject: [PATCH 608/633] remove magic numbers probs

---
 docs/conf.py                            |   1 +
 rasa/core/policies/ted_policy.py        |   4 +-
 rasa/nlu/classifiers/diet_classifier.py |   6 +-
 rasa/nlu/tokenizers/spacy_tokenizer.py  |   7 +-
 rasa/utils/tensorflow/layers.py         | 103 +++++++++++++++---------
 5 files changed, 69 insertions(+), 52 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 8b08d499c0ae..63ccfeb2bee1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -366,6 +366,7 @@
     ("py:class", "typing.Optional"),
     ("py:class", "typing.Generator"),
     ("py:class", "typing.Iterator"),
+    ("py:class", "typing.Type"),
     ("py:class", "collections.deque"),
     ("py:class", "sanic.app.Sanic"),
     ("py:data", "typing.Any"),
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 26eb88d2e8e1..62f72b33360e 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -197,8 +197,8 @@ def __init__(
 
         self.model = model
 
-        self._label_data = None  # RasaModelData
-        self.data_example = None  # Dict[Text, List[np.ndarray]]
+        self._label_data: Optional[RasaModelData] = None
+        self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None
 
     def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = copy.deepcopy(self.defaults)
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b5ba6fd0d0e9..c0318984f0d0 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -285,12 +285,8 @@ def __init__(
         # keep the input tuple sizes in self.batch_in
         self.batch_tuple_sizes = batch_tuple_sizes
 
-        # encode all label_ids with numbers
+        self.num_tags: Optional[int] = None  # number of entity tags
         self._label_data: Optional[RasaModelData] = None
-
-        # number of entity tags
-        self.num_tags: Optional[int] = None
-
         self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None
 
     @property
diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py
index 5e05aa88e219..58368b48aaf7 100644
--- a/rasa/nlu/tokenizers/spacy_tokenizer.py
+++ b/rasa/nlu/tokenizers/spacy_tokenizer.py
@@ -11,11 +11,6 @@
 if typing.TYPE_CHECKING:
     from spacy.tokens.doc import Doc  # pytype: disable=import-error
 
-try:
-    import spacy
-except ImportError:
-    spacy = None
-
 
 POS_TAG_KEY = "pos"
 
@@ -47,6 +42,8 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
 
     @staticmethod
     def _tag_of_token(token: Any) -> Text:
+        import spacy
+
         if spacy.about.__version__ > "2" and token._.has("tag"):
             return token._.get("tag")
         else:
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index f1f2a5ed9932..b2fb4d84eb0b 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -1,5 +1,5 @@
 import logging
-from typing import List, Optional, Text, Tuple, Callable, Union
+from typing import List, Optional, Text, Tuple, Callable, Union, Any
 import tensorflow as tf
 import tensorflow_addons as tfa
 from tensorflow.python.keras.utils import tf_utils
@@ -26,6 +26,7 @@ def dropped_inputs() -> tf.Tensor:
         outputs = tf_utils.smart_cond(
             training, dropped_inputs, lambda: tf.identity(inputs)
         )
+        # need to explicitly set shape, because it becomes dynamic after `retain`
         # noinspection PyProtectedMember
         outputs._dense_shape = inputs._dense_shape
 
@@ -35,7 +36,7 @@ def dropped_inputs() -> tf.Tensor:
 class DenseForSparse(tf.keras.layers.Dense):
     """Dense layer for sparse input tensor."""
 
-    def __init__(self, reg_lambda: float = 0, **kwargs) -> None:
+    def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None:
         if reg_lambda > 0:
             regularizer = tf.keras.regularizers.l2(reg_lambda)
         else:
@@ -117,9 +118,6 @@ def __init__(
     def call(
         self, x: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None
     ) -> tf.Tensor:
-        if training is None:
-            training = K.learning_phase()
-
         for layer in self._ffn_layers:
             x = layer(x, training=training)
 
@@ -142,7 +140,7 @@ def __init__(
         if self.similarity_type and self.similarity_type not in {COSINE, INNER}:
             raise ValueError(
                 f"Wrong similarity type '{self.similarity_type}', "
-                f"should be '{COSINE}' or '{INNER}'"
+                f"should be '{COSINE}' or '{INNER}'."
             )
 
         regularizer = tf.keras.regularizers.l2(reg_lambda)
@@ -156,12 +154,19 @@ def __init__(
     def call(self, x: tf.Tensor) -> tf.Tensor:
         x = self._dense(x)
         if self.similarity_type == COSINE:
-            x = tf.nn.l2_normalize(x, -1)
+            x = tf.nn.l2_normalize(x, axis=-1)
 
         return x
 
 
 class InputMask(tf.keras.layers.Layer):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+
+        self._masking_prob = 0.85
+        self._mask_vector_prob = 0.7
+        self._random_vector_prob = 0.1
+
     def build(self, input_shape: tf.TensorShape) -> None:
         self.mask_vector = self.add_weight(
             shape=(1, 1, input_shape[-1]), name="mask_vector"
@@ -180,9 +185,9 @@ def call(
             training = K.learning_phase()
 
         lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask
-        lm_mask_bool = tf.greater_equal(lm_mask_prob, 0.85)
+        lm_mask_bool = tf.greater_equal(lm_mask_prob, self._masking_prob)
 
-        def x_masked():
+        def x_masked() -> tf.Tensor:
             x_random_pad = tf.random.uniform(
                 tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype
             ) * (1 - mask)
@@ -202,9 +207,13 @@ def x_masked():
             other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype)
             other_prob = tf.tile(other_prob, (1, 1, x.shape[-1]))
             x_other = tf.where(
-                other_prob < 0.70,
+                other_prob < self._mask_vector_prob,
                 mask_vector,
-                tf.where(other_prob < 0.80, x_shuffle, x),
+                tf.where(
+                    other_prob < self._mask_vector_prob + self._random_vector_prob,
+                    x_shuffle,
+                    x,
+                ),
             )
 
             return tf.where(tf.tile(lm_mask_bool, (1, 1, x.shape[-1])), x_other, x)
@@ -216,16 +225,18 @@ def x_masked():
 
 
 class CRF(tf.keras.layers.Layer):
-    def __init__(self, num_tags: int, reg_lambda: float, name: Text = None) -> None:
+    def __init__(
+        self, num_tags: int, reg_lambda: float, name: Optional[Text] = None
+    ) -> None:
         super().__init__(name=name)
         self.num_tags = num_tags
-        self.regularizer = tf.keras.regularizers.l2(reg_lambda)
+        self.transition_regularizer = tf.keras.regularizers.l2(reg_lambda)
 
     def build(self, input_shape: tf.TensorShape) -> None:
-        # should be created in `build` to apply random_seed
+        # the weights should be created in `build` to apply random_seed
         self.transition_params = self.add_weight(
             shape=(self.num_tags, self.num_tags),
-            regularizer=self.regularizer,
+            regularizer=self.transition_regularizer,
             name="transitions",
         )
         self.built = True
@@ -260,7 +271,7 @@ def __init__(
         use_max_sim_neg: bool,
         neg_lambda: float,
         scale_loss: bool,
-        name: Text = None,
+        name: Optional[Text] = None,
         parallel_iterations: int = 1000,
         same_sampling: bool = False,
     ) -> None:
@@ -281,8 +292,10 @@ def _make_flat(x: tf.Tensor) -> tf.Tensor:
 
         return tf.reshape(x, (-1, x.shape[-1]))
 
-    def _random_indices(self, batch_size: tf.Tensor, total_candidates: tf.Tensor):
-        def rand_idxs():
+    def _random_indices(
+        self, batch_size: tf.Tensor, total_candidates: tf.Tensor
+    ) -> tf.Tensor:
+        def rand_idxs() -> tf.Tensor:
             """Create random tensor of indices"""
 
             # (1, num_neg)
@@ -293,29 +306,29 @@ def rand_idxs():
         if self.same_sampling:
             return tf.tile(rand_idxs(), (batch_size, 1))
 
-        def cond(i, out):
+        def cond(idx: tf.Tensor, out: tf.Tensor) -> tf.Tensor:
             """Condition for while loop"""
-            return i < batch_size
+            return idx < batch_size
 
-        def body(i, out):
+        def body(idx: tf.Tensor, out: tf.Tensor) -> List[tf.Tensor]:
             """Body of the while loop"""
             return [
                 # increment counter
-                i + 1,
+                idx + 1,
                 # add random indices
                 tf.concat([out, rand_idxs()], 0),
             ]
 
         # first tensor already created
-        i1 = tf.constant(1)
+        idx1 = tf.constant(1)
         # create first random array of indices
         out1 = rand_idxs()  # (1, num_neg)
 
         return tf.while_loop(
             cond,
             body,
-            loop_vars=[i1, out1],
-            shape_invariants=[i1.shape, tf.TensorShape([None, self.num_neg])],
+            loop_vars=[idx1, out1],
+            shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])],
             parallel_iterations=self.parallel_iterations,
             back_prop=False,
         )[1]
@@ -336,7 +349,7 @@ def _get_bad_mask(
         Checks that input features are different for positive negative samples.
         """
 
-        pos_labels = tf.expand_dims(target_labels, -2)
+        pos_labels = tf.expand_dims(target_labels, axis=-2)
         neg_labels = self._sample_idxs(tf.shape(target_labels)[0], labels, idxs)
 
         return tf.cast(
@@ -361,6 +374,8 @@ def _get_negs(
         bad_negs = self._get_bad_mask(labels_flat, target_labels_flat, neg_ids)
 
         if len(target_labels.shape) == 3:
+            # tensors were flattened for sampling, reshape back
+            # add sequence dimension if it was present in the inputs
             target_shape = tf.shape(target_labels)
             neg_embeds = tf.reshape(
                 neg_embeds, (target_shape[0], target_shape[1], -1, embeds.shape[-1])
@@ -379,8 +394,8 @@ def _sample_negatives(
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
         """Sample negative examples."""
 
-        pos_inputs_embed = tf.expand_dims(inputs_embed, -2)
-        pos_labels_embed = tf.expand_dims(labels_embed, -2)
+        pos_inputs_embed = tf.expand_dims(inputs_embed, axis=-2)
+        pos_labels_embed = tf.expand_dims(labels_embed, axis=-2)
 
         # sample negative inputs
         neg_inputs_embed, inputs_bad_negs = self._get_negs(inputs_embed, labels, labels)
@@ -401,7 +416,7 @@ def _sample_negatives(
     def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
         """Calculate similarity between given tensors."""
 
-        sim = tf.reduce_sum(a * b, -1)
+        sim = tf.reduce_sum(a * b, axis=-1)
         if mask is not None:
             sim *= tf.expand_dims(mask, 2)
 
@@ -458,9 +473,11 @@ def _train_sim(
     def _calc_accuracy(sim_pos: tf.Tensor, sim_neg: tf.Tensor) -> tf.Tensor:
         """Calculate accuracy."""
 
-        max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1)
+        max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], axis=-1), axis=-1)
         return tf.reduce_mean(
-            tf.cast(tf.math.equal(max_all_sim, tf.squeeze(sim_pos, -1)), tf.float32)
+            tf.cast(
+                tf.math.equal(max_all_sim, tf.squeeze(sim_pos, axis=-1)), tf.float32
+            )
         )
 
     def _loss_margin(
@@ -475,35 +492,41 @@ def _loss_margin(
         """Define max margin loss."""
 
         # loss for maximizing similarity with correct action
-        loss = tf.maximum(0.0, self.mu_pos - tf.squeeze(sim_pos, -1))
+        loss = tf.maximum(0.0, self.mu_pos - tf.squeeze(sim_pos, axis=-1))
 
         # loss for minimizing similarity with `num_neg` incorrect actions
         if self.use_max_sim_neg:
             # minimize only maximum similarity over incorrect actions
-            max_sim_neg_il = tf.reduce_max(sim_neg_il, -1)
+            max_sim_neg_il = tf.reduce_max(sim_neg_il, axis=-1)
             loss += tf.maximum(0.0, self.mu_neg + max_sim_neg_il)
         else:
             # minimize all similarities with incorrect actions
             max_margin = tf.maximum(0.0, self.mu_neg + sim_neg_il)
-            loss += tf.reduce_sum(max_margin, -1)
+            loss += tf.reduce_sum(max_margin, axis=-1)
 
         # penalize max similarity between pos bot and neg bot embeddings
-        max_sim_neg_ll = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_ll, -1))
+        max_sim_neg_ll = tf.maximum(
+            0.0, self.mu_neg + tf.reduce_max(sim_neg_ll, axis=-1)
+        )
         loss += max_sim_neg_ll * self.neg_lambda
 
         # penalize max similarity between pos dial and neg dial embeddings
-        max_sim_neg_ii = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_ii, -1))
+        max_sim_neg_ii = tf.maximum(
+            0.0, self.mu_neg + tf.reduce_max(sim_neg_ii, axis=-1)
+        )
         loss += max_sim_neg_ii * self.neg_lambda
 
         # penalize max similarity between pos bot and neg dial embeddings
-        max_sim_neg_li = tf.maximum(0.0, self.mu_neg + tf.reduce_max(sim_neg_li, -1))
+        max_sim_neg_li = tf.maximum(
+            0.0, self.mu_neg + tf.reduce_max(sim_neg_li, axis=-1)
+        )
         loss += max_sim_neg_li * self.neg_lambda
 
         if mask is not None:
             # mask loss for different length sequences
             loss *= mask
             # average the loss over sequence length
-            loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1)
+            loss = tf.reduce_sum(loss, axis=-1) / tf.reduce_sum(mask, axis=1)
 
         # average the loss over the batch
         loss = tf.reduce_mean(loss)
@@ -522,7 +545,7 @@ def _loss_softmax(
         """Define softmax loss."""
 
         logits = tf.concat(
-            [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], -1
+            [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1
         )
 
         # create label_ids for softmax
@@ -544,7 +567,7 @@ def _loss_softmax(
 
         if len(loss.shape) == 2:
             # average over the sequence
-            loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, -1)
+            loss = tf.reduce_sum(loss, axis=-1) / tf.reduce_sum(mask, axis=-1)
 
         # average the loss over all examples
         loss = tf.reduce_mean(loss)

From 4eda2e5fde9952d84a36ef6edc1cb29a4b791e2e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 19:00:05 +0100
Subject: [PATCH 609/633] fix type of Data in model data

---
 rasa/utils/tensorflow/model_data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 7e4da6ccab54..a0bf93c55183 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -12,7 +12,7 @@
 logger = logging.getLogger(__name__)
 
 
-Data = Optional[Dict[Text, List[np.ndarray]]]
+Data = Dict[Text, List[np.ndarray]]
 
 
 class FeatureSignature(NamedTuple):
@@ -21,7 +21,7 @@ class FeatureSignature(NamedTuple):
 
 
 class RasaModelData:
-    def __init__(self, label_key: Optional[Text] = None, data: Data = None):
+    def __init__(self, label_key: Optional[Text] = None, data: Optional[Data] = None):
         self.data = data or {}
         self.label_key = label_key
         # will be updated when features are added
@@ -401,7 +401,7 @@ def _check_train_test_sizes(
             )
 
     @staticmethod
-    def _data_for_ids(data: Data, ids: np.ndarray) -> Dict[Text, List[np.ndarray]]:
+    def _data_for_ids(data: Data, ids: np.ndarray) -> Data:
         """Filter session data by ids."""
 
         new_data = defaultdict(list)

From f1f6c43cde218d3687ebd110959afe4aa97016d1 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 19:16:14 +0100
Subject: [PATCH 610/633] add axis=

---
 rasa/core/policies/ted_policy.py        |  2 +-
 rasa/nlu/classifiers/diet_classifier.py |  4 ++--
 rasa/utils/tensorflow/transformer.py    | 16 ++++++----------
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 62f72b33360e..1f031fa39085 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -564,7 +564,7 @@ def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor
 
         # mask different length sequences
         # if there is at least one `-1` it should be masked
-        mask = tf.sign(tf.reduce_max(dialogue_in, -1) + 1)
+        mask = tf.sign(tf.reduce_max(dialogue_in, axis=-1) + 1)
 
         dialogue = self._tf_layers[f"ffnn.{DIALOGUE}"](dialogue_in, self._training)
         dialogue_transformed = self._tf_layers["transformer"](
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c0318984f0d0..efe40a269102 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1152,7 +1152,7 @@ def _prepare_entity_recognition_layers(self) -> None:
 
     @staticmethod
     def _get_sequence_lengths(mask: tf.Tensor) -> tf.Tensor:
-        return tf.cast(tf.reduce_sum(mask[:, :, 0], 1), tf.int32)
+        return tf.cast(tf.reduce_sum(mask[:, :, 0], axis=1), tf.int32)
 
     def _combine_sparse_dense_features(
         self,
@@ -1359,7 +1359,7 @@ def batch_loss(
 
         if self.config[MASKED_LM]:
             loss, acc = self._mask_loss(
-                text_transformed, text_in, text_seq_ids, lm_mask_bool_text, "text"
+                text_transformed, text_in, text_seq_ids, lm_mask_bool_text, TEXT
             )
             self.mask_loss.update_state(loss)
             self.mask_acc.update_state(acc)
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 9903e7ce5688..853093364ebe 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -302,12 +302,10 @@ def _combine_heads(self, x: tf.Tensor) -> tf.Tensor:
           a Tensor with shape [batch, length, channels]
         """
 
-        x = tf.transpose(
-            x, perm=[0, 2, 1, 3]
-        )  # (batch_size, seq_len_q, num_heads, depth)
-        return tf.reshape(
-            x, (tf.shape(x)[0], -1, self.units)
-        )  # (batch_size, seq_len_q, units)
+        # (batch_size, seq_len_q, num_heads, depth)
+        x = tf.transpose(x, perm=[0, 2, 1, 3])
+        # (batch_size, seq_len_q, units)
+        return tf.reshape(x, (tf.shape(x)[0], -1, self.units))
 
     # noinspection PyMethodOverriding
     def call(
@@ -469,7 +467,7 @@ def _positional_encoding(self, max_position: tf.Tensor) -> tf.Tensor:
         angle_rads = tf.range(max_position)[:, tf.newaxis] * self._angles
 
         # transpose for easy slicing
-        angle_rads = tf.transpose(angle_rads, [1, 0])
+        angle_rads = tf.transpose(angle_rads, perm=[1, 0])
         shape = tf.shape(angle_rads)
         # apply sin to even indices in the array; 2i
         sin_even = tf.sin(tf.gather_nd(angle_rads, self._even_indices))
@@ -478,7 +476,7 @@ def _positional_encoding(self, max_position: tf.Tensor) -> tf.Tensor:
         cos_odd = tf.cos(tf.gather_nd(angle_rads, self._odd_indices))
         pos_encoding_odd = tf.scatter_nd(self._odd_indices, cos_odd, shape)
         # combine even and odd positions and transpose back
-        pos_encoding = tf.transpose(pos_encoding_even + pos_encoding_odd, [1, 0])
+        pos_encoding = tf.transpose(pos_encoding_even + pos_encoding_odd, perm=[1, 0])
         # add batch dimension
         return tf.stop_gradient(pos_encoding[tf.newaxis, ...])
 
@@ -493,8 +491,6 @@ def call(
         pad_mask: Optional[tf.Tensor] = None,
         training: Optional[Union[tf.Tensor, bool]] = None,
     ) -> tf.Tensor:
-        if training is None:
-            training = K.learning_phase()
 
         # adding embedding and position encoding.
         x = self._embedding(x)  # (batch_size, seq_len, units)

From 0542b2894255042ab100d9b6c0b5de7b1450dd11 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 19:28:51 +0100
Subject: [PATCH 611/633] add explanatory comments

---
 rasa/utils/tensorflow/layers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index b2fb4d84eb0b..b25ad97b151f 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -373,6 +373,7 @@ def _get_negs(
         neg_embeds = self._sample_idxs(target_size, embeds_flat, neg_ids)
         bad_negs = self._get_bad_mask(labels_flat, target_labels_flat, neg_ids)
 
+        # check if inputs have sequence dimension
         if len(target_labels.shape) == 3:
             # tensors were flattened for sampling, reshape back
             # add sequence dimension if it was present in the inputs
@@ -561,6 +562,7 @@ def _loss_softmax(
         if self.scale_loss:
             # mask loss by prediction confidence
             pos_pred = tf.stop_gradient(tf.nn.softmax(logits)[..., 0])
+            # the scaling parameters are found empirically
             scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4)
             # scale loss
             loss *= scale_mask

From 1e8b7b94436e0834aef4dac287b7075abdd6f14a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 22:00:21 +0100
Subject: [PATCH 612/633] check if responses are present.

---
 rasa/nlu/selectors/response_selector.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index ab302426308f..037dfe748e4a 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -258,6 +258,11 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         label_id_index_mapping = self._label_id_index_mapping(
             training_data, attribute=RESPONSE
         )
+
+        if not label_id_index_mapping:
+            # no responses present to train
+            return RasaModelData()
+
         self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
 
         self._label_data = self._create_label_data(

From 937813d39ca8e36df39fc384b3b48354501d6a12 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 25 Feb 2020 22:23:13 +0100
Subject: [PATCH 613/633] review comments

---
 rasa/nlu/utils/bilou_utils.py        | 12 +++++++-----
 rasa/utils/tensorflow/model_data.py  | 12 +++++++-----
 rasa/utils/tensorflow/models.py      | 15 ++++++++++-----
 rasa/utils/tensorflow/transformer.py | 12 ++++++------
 4 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 712127ced134..10c42a30e7a4 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Text, Optional, Dict, Set
+from typing import List, Tuple, Text, Optional, Dict, Set, Any
 
 from rasa.nlu.tokenizers.tokenizer import Token
 from rasa.nlu.training_data import Message
@@ -74,7 +74,7 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
     return tag_id_dict
 
 
-def apply_bilou_schema(training_data: TrainingData):
+def apply_bilou_schema(training_data: TrainingData) -> None:
     """Gets a list of BILOU entity tags and sets them on the corresponding message."""
 
     for message in training_data.training_examples:
@@ -92,7 +92,7 @@ def apply_bilou_schema(training_data: TrainingData):
 def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
     """Maps the entities of the given message to their start, end, and tag values."""
 
-    def convert_entity(entity):
+    def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]:
         return entity["start"], entity["end"], entity["entity"]
 
     return [convert_entity(entity) for entity in message.get(ENTITIES, [])]
@@ -113,7 +113,9 @@ def bilou_tags_from_offsets(
     bilou = ["-" for _ in tokens]
 
     # Handle entity cases
-    _handle_entities(bilou, entities, end_pos_to_token_idx, start_pos_to_token_idx)
+    _add_bilou_tags_to_entities(
+        bilou, entities, end_pos_to_token_idx, start_pos_to_token_idx
+    )
 
     # Now distinguish the O cases from ones where we miss the tokenization
     entity_positions = _get_entity_positions(entities)
@@ -122,7 +124,7 @@ def bilou_tags_from_offsets(
     return bilou
 
 
-def _handle_entities(
+def _add_bilou_tags_to_entities(
     bilou: List[Text],
     entities: List[Tuple[int, int, Text]],
     end_pos_to_token_idx: Dict[int, int],
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index a0bf93c55183..4f6159d29b86 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -21,7 +21,9 @@ class FeatureSignature(NamedTuple):
 
 
 class RasaModelData:
-    def __init__(self, label_key: Optional[Text] = None, data: Optional[Data] = None):
+    def __init__(
+        self, label_key: Optional[Text] = None, data: Optional[Data] = None
+    ) -> None:
         self.data = data or {}
         self.label_key = label_key
         # will be updated when features are added
@@ -52,7 +54,7 @@ def feature_not_exist(self, key: Text) -> bool:
         """Check if feature key is present and features are available."""
         return key not in self.data or not self.data[key]
 
-    def is_empty(self):
+    def is_empty(self) -> bool:
         """Checks if data is set."""
         return not self.data
 
@@ -331,7 +333,7 @@ def _get_shapes_types(self) -> Tuple:
         types = []
         shapes = []
 
-        def append_shape(features: np.ndarray):
+        def append_shape(features: np.ndarray) -> None:
             if isinstance(features[0], scipy.sparse.spmatrix):
                 # scipy matrix is converted into indices, data, shape
                 shapes.append((None, features[0].ndim + 1))
@@ -344,7 +346,7 @@ def append_shape(features: np.ndarray):
             else:
                 shapes.append((None, None, features[0].shape[-1]))
 
-        def append_type(features: np.ndarray):
+        def append_type(features: np.ndarray) -> None:
             if isinstance(features[0], scipy.sparse.spmatrix):
                 # scipy matrix is converted into indices, data, shape
                 types.append(tf.int64)
@@ -532,7 +534,7 @@ def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
 
     @staticmethod
     def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
-        """Convert a scipy matrix into inidces, data, and shape."""
+        """Convert a scipy matrix into indices, data, and shape."""
 
         if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
             array_of_sparse = [x.tocoo() for x in array_of_sparse]
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 27680a9a2d4d..1c7c4066e21c 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -82,21 +82,26 @@ def fit(
         val_results = {}  # validation is not performed every epoch
         progress_bar = tqdm(range(epochs), desc="Epochs", disable=disable)
 
-        for ep in progress_bar:
-            ep_batch_size = self.linearly_increasing_batch_size(ep, batch_size, epochs)
+        for epoch in progress_bar:
+            epoch_batch_size = self.linearly_increasing_batch_size(
+                epoch, batch_size, epochs
+            )
 
             self._batch_loop(
-                train_dataset_function, tf_train_on_batch_function, ep_batch_size, True
+                train_dataset_function,
+                tf_train_on_batch_function,
+                epoch_batch_size,
+                True,
             )
 
             postfix_dict = self._get_metric_results()
 
             if evaluate_on_num_examples > 0:
-                if self._should_evaluate(evaluate_every_num_epochs, epochs, ep):
+                if self._should_evaluate(evaluate_every_num_epochs, epochs, epoch):
                     self._batch_loop(
                         evaluation_dataset_function,
                         tf_evaluation_on_batch_function,
-                        ep_batch_size,
+                        epoch_batch_size,
                         False,
                     )
                     val_results = self._get_metric_results(prefix="val_")
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 853093364ebe..92426769f650 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -87,12 +87,12 @@ def _create_relative_embeddings(self) -> None:
 
         if self.use_key_relative_position:
             self.key_relative_embeddings = self.add_weight(
-                shape=relative_embedding_shape, name="key_relative_embeddings",
+                shape=relative_embedding_shape, name="key_relative_embeddings"
             )
 
         if self.use_value_relative_position:
             self.value_relative_embeddings = self.add_weight(
-                shape=relative_embedding_shape, name="value_relative_embeddings",
+                shape=relative_embedding_shape, name="value_relative_embeddings"
             )
 
     def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor:
@@ -356,7 +356,7 @@ def __init__(
     ) -> None:
         super().__init__()
 
-        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self._layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
         self._mha = MultiHeadAttention(
             units,
             num_heads,
@@ -391,7 +391,7 @@ def call(
         if training is None:
             training = K.learning_phase()
 
-        x_norm = self._layernorm(x)  # (batch_size, seq_len, units)
+        x_norm = self._layer_norm(x)  # (batch_size, seq_len, units)
         attn_out, _ = self._mha(
             x_norm, x_norm, x_norm, pad_mask=pad_mask, training=training
         )
@@ -456,7 +456,7 @@ def __init__(
             )
             for _ in range(num_layers)
         ]
-        self._layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self._layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
 
     def _get_angles(self) -> np.ndarray:
         i = np.arange(self.units)[np.newaxis, :]
@@ -514,4 +514,4 @@ def call(
         # if normalization is done in encoding layers, then it should also be done
         # on the output, since the output can grow very large, being the sum of
         # a whole stack of unnormalized layer outputs.
-        return self._layernorm(x)  # (batch_size, seq_len, units)
+        return self._layer_norm(x)  # (batch_size, seq_len, units)

From 2886ea0f3885554bfbf9430ec11b8f840f6ce24f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 25 Feb 2020 23:41:24 +0100
Subject: [PATCH 614/633] add comment and type

---
 rasa/utils/tensorflow/model_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index a0bf93c55183..9419f40936b6 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -21,7 +21,9 @@ class FeatureSignature(NamedTuple):
 
 
 class RasaModelData:
-    def __init__(self, label_key: Optional[Text] = None, data: Optional[Data] = None):
+    def __init__(
+        self, label_key: Optional[Text] = None, data: Optional[Data] = None
+    ) -> None:
         self.data = data or {}
         self.label_key = label_key
         # will be updated when features are added

From 483713b4ed6aa300649966d03d71ce377d6f8830 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 00:10:22 +0100
Subject: [PATCH 615/633] rename relative lengths

---
 rasa/utils/tensorflow/transformer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 92426769f650..4cbdfcb0d75b 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -102,12 +102,12 @@ def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor
 
         # pad the right side to length
         if self.unidirectional:
-            m_right = 1  # current time
+            right_relative_length = 1  # current time
             pad_right = tf.zeros_like(x[:, :, :, -1:, :])
         else:
-            m_right = self.relative_length
+            right_relative_length = self.relative_length
             pad_right = x[:, :, :, -1:, :]
-        pad_right = tf.tile(pad_right, (1, 1, 1, length - m_right, 1))
+        pad_right = tf.tile(pad_right, (1, 1, 1, length - right_relative_length, 1))
 
         return tf.concat([pad_left, x, pad_right], axis=-2)
 
@@ -118,9 +118,9 @@ def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tens
             pad_right = tf.tile(pad_right, (1, 1, 1, self.relative_length - 1, 1))
             x = tf.concat([x, pad_right], axis=-2)
 
-        dl = self.relative_length - length
-        m = tf.shape(x)[-2]
-        return x[:, :, :, dl : m - dl, :]
+        extra_length = self.relative_length - length
+        full_length = tf.shape(x)[-2]
+        return x[:, :, :, extra_length : full_length - extra_length, :]
 
     def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor:
         """Universal method to convert tensor from relative to absolute indexing.

From c6e6f271a9a8026ff531d0b789ed38cc997c7b21 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 08:38:25 +0100
Subject: [PATCH 616/633] remove batch_tuple_sizes

---
 rasa/core/policies/ted_policy.py              |  2 +-
 rasa/nlu/classifiers/diet_classifier.py       | 26 +++++++------------
 .../embedding_intent_classifier.py            |  7 +----
 rasa/nlu/selectors/response_selector.py       |  9 ++-----
 rasa/utils/tensorflow/model_data.py           | 18 -------------
 tests/nlu/conftest.py                         | 12 ++++-----
 6 files changed, 19 insertions(+), 55 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 1f031fa39085..41963b40315c 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -186,7 +186,7 @@ def __init__(
         model: Optional[RasaModel] = None,
         **kwargs: Dict[Text, Any],
     ) -> None:
-        """Declare instance variables with default values"""
+        """Declare instance variables with default values."""
 
         if not featurizer:
             featurizer = self._standard_featurizer(max_history)
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index efe40a269102..c3e402ee7167 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -262,7 +262,6 @@ def __init__(
         index_label_id_mapping: Optional[Dict[int, Text]] = None,
         index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
         """Declare instance variables with default values."""
 
@@ -282,9 +281,6 @@ def __init__(
 
         self.model = model
 
-        # keep the input tuple sizes in self.batch_in
-        self.batch_tuple_sizes = batch_tuple_sizes
-
         self.num_tags: Optional[int] = None  # number of entity tags
         self._label_data: Optional[RasaModelData] = None
         self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None
@@ -568,6 +564,11 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         label_id_index_mapping = self._label_id_index_mapping(
             training_data, attribute=INTENT
         )
+
+        if not label_id_index_mapping:
+            # no responses present to train
+            return RasaModelData()
+
         self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
 
         self._label_data = self._create_label_data(
@@ -809,9 +810,6 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]:
             model_dir / f"{file_name}.index_tag_id_mapping.pkl",
             self.index_tag_id_mapping,
         )
-        io_utils.json_pickle(
-            model_dir / f"{file_name}.batch_tuple_sizes.pkl", self.batch_tuple_sizes
-        )
 
         return {"file": file_name}
 
@@ -834,7 +832,6 @@ def load(
             return cls(component_config=meta)
 
         (
-            batch_tuple_sizes,
             index_label_id_mapping,
             index_tag_id_mapping,
             label_data,
@@ -853,7 +850,6 @@ def load(
             index_label_id_mapping=index_label_id_mapping,
             index_tag_id_mapping=index_tag_id_mapping,
             model=model,
-            batch_tuple_sizes=batch_tuple_sizes,
         )
 
     @classmethod
@@ -870,9 +866,6 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
         index_tag_id_mapping = io_utils.json_unpickle(
             model_dir / f"{file_name}.index_tag_id_mapping.pkl"
         )
-        batch_tuple_sizes = io_utils.json_unpickle(
-            model_dir / f"{file_name}.batch_tuple_sizes.pkl"
-        )
 
         # jsonpickle converts dictionary keys to strings
         index_label_id_mapping = {
@@ -884,7 +877,6 @@ def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text):
             }
 
         return (
-            batch_tuple_sizes,
             index_label_id_mapping,
             index_tag_id_mapping,
             label_data,
@@ -1300,7 +1292,7 @@ def _mask_loss(
             outputs_embed, inputs_embed, ids, inputs_embed, ids
         )
 
-    def _label_loss(
+    def _calculate_label_loss(
         self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor
     ) -> tf.Tensor:
         all_label_ids, all_labels_embed = self._create_all_labels()
@@ -1312,7 +1304,7 @@ def _label_loss(
             a_embed, b_embed, label_ids, all_labels_embed, all_label_ids
         )
 
-    def _entity_loss(
+    def _calculate_entity_loss(
         self,
         outputs: tf.Tensor,
         tag_ids: tf.Tensor,
@@ -1375,7 +1367,7 @@ def batch_loss(
                 tf_batch_data[LABEL_MASK][0],
                 self.label_name,
             )
-            loss, acc = self._label_loss(cls, label, label_ids)
+            loss, acc = self._calculate_label_loss(cls, label, label_ids)
             self.intent_loss.update_state(loss)
             self.response_acc.update_state(acc)
             losses.append(loss)
@@ -1383,7 +1375,7 @@ def batch_loss(
         if self.config[ENTITY_RECOGNITION]:
             tag_ids = tf_batch_data[TAG_IDS][0]
 
-            loss, f1 = self._entity_loss(
+            loss, f1 = self._calculate_entity_loss(
                 text_transformed, tag_ids, mask_text, sequence_lengths
             )
             self.entity_loss.update_state(loss)
diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index ed17bf4b8081..43b485df5e3c 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -140,7 +140,6 @@ def __init__(
         index_label_id_mapping: Optional[Dict[int, Text]] = None,
         index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
 
         component_config = component_config or {}
@@ -153,11 +152,7 @@ def __init__(
         component_config[NUM_TRANSFORMER_LAYERS] = 0
 
         super().__init__(
-            component_config,
-            index_label_id_mapping,
-            index_tag_id_mapping,
-            model,
-            batch_tuple_sizes,
+            component_config, index_label_id_mapping, index_tag_id_mapping, model
         )
 
         common_utils.raise_warning(
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 037dfe748e4a..ed6705c2f70a 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -194,7 +194,6 @@ def __init__(
         index_label_id_mapping: Optional[Dict[int, Text]] = None,
         index_tag_id_mapping: Optional[Dict[int, Text]] = None,
         model: Optional[RasaModel] = None,
-        batch_tuple_sizes: Optional[Dict] = None,
     ) -> None:
 
         component_config = component_config or {}
@@ -205,11 +204,7 @@ def __init__(
         component_config[BILOU_FLAG] = None
 
         super().__init__(
-            component_config,
-            index_label_id_mapping,
-            index_tag_id_mapping,
-            model,
-            batch_tuple_sizes,
+            component_config, index_label_id_mapping, index_tag_id_mapping, model
         )
 
     @property
@@ -410,7 +405,7 @@ def batch_loss(
         cls_label = self._last_token(label_transformed, sequence_lengths_label)
         label_ids = tf_batch_data[LABEL_IDS][0]
 
-        loss, acc = self._label_loss(cls_text, cls_label, label_ids)
+        loss, acc = self._calculate_label_loss(cls_text, cls_label, label_ids)
         self.response_loss.update_state(loss)
         self.response_acc.update_state(acc)
         losses.append(loss)
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 4f6159d29b86..10dcbd5bccae 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -295,24 +295,6 @@ def prepare_batch(
         # len of batch_data is equal to the number of keys in session data
         return tuple(batch_data)
 
-    def batch_tuple_sizes(self) -> Dict[Text, int]:
-
-        # save the amount of placeholders attributed to session data keys
-        tuple_sizes = defaultdict(int)
-
-        idx = 0
-        for k, values in self.data.items():
-            tuple_sizes[k] = 0
-            for v in values:
-                if isinstance(v[0], scipy.sparse.spmatrix):
-                    tuple_sizes[k] += 3
-                    idx += 3
-                else:
-                    tuple_sizes[k] += 1
-                    idx += 1
-
-        return tuple_sizes
-
     def as_tf_dataset(
         self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
     ) -> tf.data.Dataset:
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index b3c564698ece..9c644930e5bd 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -45,9 +45,9 @@ def config_path() -> Text:
             "language": "en",
             "pipeline": [
                 {"name": "WhitespaceTokenizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 2, RANDOM_SEED: 42},
+                {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42},
                 {"name": "CountVectorsFeaturizer"},
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 2, RANDOM_SEED: 42},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42},
             ],
         }
     ).name
@@ -63,7 +63,7 @@ def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig:
                 {"name": "SpacyTokenizer"},
                 {"name": "SpacyFeaturizer"},
                 {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 3, RANDOM_SEED: 42},
+                {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42},
                 {"name": "EntitySynonymMapper"},
                 {"name": "SklearnIntentClassifier"},
             ],
@@ -79,7 +79,7 @@ def supervised_embeddings_config() -> RasaNLUModelConfig:
             "pipeline": [
                 {"name": "WhitespaceTokenizer"},
                 {"name": "RegexFeaturizer"},
-                {"name": "CRFEntityExtractor", EPOCHS: 3, RANDOM_SEED: 42},
+                {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42},
                 {"name": "EntitySynonymMapper"},
                 {"name": "CountVectorsFeaturizer"},
                 {
@@ -88,7 +88,7 @@ def supervised_embeddings_config() -> RasaNLUModelConfig:
                     "min_ngram": 1,
                     "max_ngram": 4,
                 },
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 3, RANDOM_SEED: 42},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42},
             ],
         }
     )
@@ -102,7 +102,7 @@ def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
             "pipeline": [
                 {"name": "ConveRTTokenizer"},
                 {"name": "ConveRTFeaturizer"},
-                {"name": "EmbeddingIntentClassifier", EPOCHS: 3, RANDOM_SEED: 42},
+                {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42},
             ],
         }
     )

From 502ef228ab1c30998b90e5620c3ec56c3df25806 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 09:26:05 +0100
Subject: [PATCH 617/633] review comments

---
 rasa/nlu/classifiers/diet_classifier.py       |  3 +
 .../dense_featurizer/mitie_featurizer.py      |  3 +-
 .../dense_featurizer/spacy_featurizer.py      |  5 +-
 rasa/nlu/featurizers/featurizer.py            |  8 ++-
 rasa/nlu/utils/bilou_utils.py                 | 58 ++++++++++++++++---
 .../nlu/utils/hugging_face/hf_transformers.py |  8 ++-
 rasa/utils/tensorflow/constants.py            |  4 ++
 rasa/utils/tensorflow/model_data.py           |  6 ++
 8 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c3e402ee7167..fa59cac40755 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1318,6 +1318,7 @@ def _calculate_entity_loss(
 
         # should call first to build weights
         pred_ids = self._tf_layers["crf"](logits, sequence_lengths)
+        # pytype cannot infer that 'self._tf_layers["crf"]' has the method '.loss'
         # pytype: disable=attribute-error
         loss = self._tf_layers["crf"].loss(logits, tag_ids, sequence_lengths)
         # pytype: enable=attribute-error
@@ -1407,6 +1408,8 @@ def batch_predict(
             cls = self._last_token(text_transformed, sequence_lengths)
             cls_embed = self._tf_layers[f"embed.{TEXT}"](cls)
 
+            # pytype cannot infer that 'self._tf_layers[f"loss.{LABEL}"]' has methods
+            # like '.sim' or '.confidence_from_sim'
             # pytype: disable=attribute-error
             sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
                 cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :]
diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
index 0a0af6ef73a7..cb286d495b7f 100644
--- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py
@@ -14,6 +14,7 @@
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
+from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING
 
 if typing.TYPE_CHECKING:
     import mitie
@@ -27,7 +28,7 @@ def required_components(cls) -> List[Type[Component]]:
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
         # the CLS token. Available options: 'mean' and 'max'
-        "pooling": "mean"
+        POOLING: MEAN_POOLING
     }
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
index 5b441301fb99..dad98049427a 100644
--- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py
@@ -14,6 +14,7 @@
     DENSE_FEATURE_NAMES,
     DENSE_FEATURIZABLE_ATTRIBUTES,
 )
+from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING
 
 if typing.TYPE_CHECKING:
     from spacy.tokens import Doc
@@ -27,13 +28,13 @@ def required_components(cls) -> List[Type[Component]]:
     defaults = {
         # Specify what pooling operation should be used to calculate the vector of
         # the CLS token. Available options: 'mean' and 'max'
-        "pooling": "mean"
+        POOLING: MEAN_POOLING
     }
 
     def __init__(self, component_config: Optional[Dict[Text, Any]] = None):
         super().__init__(component_config)
 
-        self.pooling_operation = self.component_config["pooling"]
+        self.pooling_operation = self.component_config[POOLING]
 
     def _features_for_doc(self, doc: "Doc") -> np.ndarray:
         """Feature vector for a single document / sentence / tokens."""
diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py
index 2c814e4cc9a0..ef896a2a5ab8 100644
--- a/rasa/nlu/featurizers/featurizer.py
+++ b/rasa/nlu/featurizers/featurizer.py
@@ -4,6 +4,7 @@
 from rasa.nlu.training_data import Message
 from rasa.nlu.components import Component
 from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT
+from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING
 
 
 def sequence_to_sentence_features(
@@ -60,14 +61,15 @@ def _calculate_cls_vector(
         if non_zero_features.size == 0:
             return np.zeros([1, features.shape[-1]])
 
-        if pooling_operation == "mean":
+        if pooling_operation == MEAN_POOLING:
             return np.mean(non_zero_features, axis=0, keepdims=True)
-        elif pooling_operation == "max":
+        elif pooling_operation == MAX_POOLING:
             return np.max(non_zero_features, axis=0, keepdims=True)
         else:
             raise ValueError(
                 f"Invalid pooling operation specified. Available operations are "
-                f"'mean' or 'max', but provided value is '{pooling_operation}'."
+                f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is "
+                f"'{pooling_operation}'."
             )
 
 
diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py
index 10c42a30e7a4..335c3fe0ece3 100644
--- a/rasa/nlu/utils/bilou_utils.py
+++ b/rasa/nlu/utils/bilou_utils.py
@@ -15,24 +15,40 @@
 
 
 def bilou_prefix_from_tag(tag: Text) -> Optional[Text]:
-    """Remove the BILOU prefix from the given tag."""
+    """Returns the BILOU prefix from the given tag.
 
+    Args:
+        tag: the tag
+
+    Returns: the BILOU prefix of the tag
+    """
     if tag[:2] in BILOU_PREFIXES:
         return tag[0]
     return None
 
 
 def entity_name_from_tag(tag: Text) -> Text:
-    """Remove the BILOU prefix from the given tag."""
+    """Remove the BILOU prefix from the given tag.
+
+    Args:
+        tag: the tag
 
+    Returns: the tag without the BILOU prefix
+    """
     if tag[:2] in BILOU_PREFIXES:
         return tag[2:]
     return tag
 
 
 def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
-    """Maps the entity tags of the message to the ids of the provided dict."""
+    """Maps the entity tags of the message to the ids of the provided dict.
 
+    Args:
+        message: the message
+        tag_id_dict: mapping of tags to ids
+
+    Returns: a list of tag ids
+    """
     if message.get(BILOU_ENTITIES):
         _tags = [
             tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict[NO_ENTITY_TAG]
@@ -45,14 +61,24 @@ def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
 
 
 def remove_bilou_prefixes(tags: List[Text]) -> List[Text]:
-    """Remove the BILOU prefixes from the given tags."""
+    """Removes the BILOU prefixes from the given list of tags.
+
+    Args:
+        tags: the list of tags
 
+    Returns: list of tags without BILOU prefix
+    """
     return [entity_name_from_tag(t) for t in tags]
 
 
 def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
-    """Create a mapping of unique tags to ids."""
+    """Create a mapping of unique tags to ids.
 
+    Args:
+        training_data: the training data
+
+    Returns: a mapping of tags to ids
+    """
     distinct_tags = set(
         [
             entity_name_from_tag(e)
@@ -75,8 +101,11 @@ def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]:
 
 
 def apply_bilou_schema(training_data: TrainingData) -> None:
-    """Gets a list of BILOU entity tags and sets them on the corresponding message."""
+    """Gets a list of BILOU entity tags and sets them on the given messages.
 
+    Args:
+        training_data: the training data
+    """
     for message in training_data.training_examples:
         entities = message.get(ENTITIES)
 
@@ -90,7 +119,13 @@ def apply_bilou_schema(training_data: TrainingData) -> None:
 
 
 def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
-    """Maps the entities of the given message to their start, end, and tag values."""
+    """Maps the entities of the given message to their start, end, and tag values.
+
+    Args:
+        message: the message
+
+    Returns: a list of start, end, and tag value tuples
+    """
 
     def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]:
         return entity["start"], entity["end"], entity["entity"]
@@ -103,8 +138,15 @@ def bilou_tags_from_offsets(
     entities: List[Tuple[int, int, Text]],
     missing: Text = NO_ENTITY_TAG,
 ) -> List[Text]:
-    """Creates a list of BILOU tags for the given list of tokens and entities."""
+    """Creates a list of BILOU tags for the given list of tokens and entities.
+
+    Args:
+        tokens: the list of tokens
+        entities: the list of start, end, and tag tuples
+        missing: tag for missing entities
 
+    Returns: a list of BILOU tags
+    """
     # From spacy.spacy.GoldParse, under MIT License
 
     start_pos_to_token_idx = {token.start: i for i, token in enumerate(tokens)}
diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py
index 3f33b9b8cf47..0ae86b3acbf6 100644
--- a/rasa/nlu/utils/hugging_face/hf_transformers.py
+++ b/rasa/nlu/utils/hugging_face/hf_transformers.py
@@ -188,10 +188,12 @@ def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray:
         attention_mask = []
         max_seq_length = max(actual_sequence_lengths)
         for actual_sequence_length in actual_sequence_lengths:
-            attention_mask.append(
-                [1] * actual_sequence_length
-                + [0] * (max_seq_length - actual_sequence_length)
+            # add 1s for present tokens, fill up the remaining space up to max
+            # sequence length with 0s (non-existing tokens)
+            padded_sequence = [1] * actual_sequence_length + [0] * (
+                max_seq_length - actual_sequence_length
             )
+            attention_mask.append(padded_sequence)
 
         attention_mask = np.array(attention_mask).astype(np.float32)
 
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 97016c54844f..3e13221041d0 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -61,3 +61,7 @@
 
 BALANCED = "balanced"
 SEQUENCE = "sequence"
+
+POOLING = "pooling"
+MAX_POOLING = "max"
+MEAN_POOLING = "mean"
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 10dcbd5bccae..8bc7a9abb17f 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -29,6 +29,12 @@ def __init__(
         # will be updated when features are added
         self.num_examples = self.number_of_examples()
 
+    def get_only(self, key: Text) -> Optional[np.ndarray]:
+        if key in self.data:
+            return self.data[key][0]
+        else:
+            return None
+
     def get(self, key: Text) -> List[np.ndarray]:
         if key in self.data:
             return self.data[key]

From baab754a9b276c9e5105f518f6e5cde1cca2d733 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 09:52:20 +0100
Subject: [PATCH 618/633] review comments

---
 rasa/utils/io.py                    | 32 +++++++++++++++++++----
 rasa/utils/tensorflow/model_data.py | 40 ++++++++++++++++++++++-------
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index 8aac3ab23fbf..705da529ba5c 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -158,14 +158,25 @@ def dump_obj_as_json_to_file(filename: Text, obj: Any) -> None:
     write_text_file(json.dumps(obj, indent=2), filename)
 
 
-def pickle_dump(filename: Union[Text, Path], obj: Any):
-    """Saves object to file."""
+def pickle_dump(filename: Union[Text, Path], obj: Any) -> None:
+    """Saves object to file.
+
+    Args:
+        filename: the filename to save the object to
+        obj: the object to store
+    """
     with open(filename, "wb") as f:
         pickle.dump(obj, f)
 
 
 def pickle_load(filename: Union[Text, Path]) -> Any:
-    """Loads an object from a file."""
+    """Loads an object from a file.
+
+    Args:
+        filename: the filename to load the object from
+
+    Returns: the loaded object
+    """
     with open(filename, "rb") as f:
         return pickle.load(f)
 
@@ -410,7 +421,13 @@ def zip_folder(folder: Text) -> Text:
 
 
 def json_unpickle(file_name: Union[Text, Path]) -> Any:
-    """Unpickle an object from file using json."""
+    """Unpickle an object from file using json.
+
+    Args:
+        file_name: the file to load the object from
+
+    Returns: the object
+    """
     import jsonpickle.ext.numpy as jsonpickle_numpy
     import jsonpickle
 
@@ -421,7 +438,12 @@ def json_unpickle(file_name: Union[Text, Path]) -> Any:
 
 
 def json_pickle(file_name: Union[Text, Path], obj: Any) -> None:
-    """Pickle an object to a file using json."""
+    """Pickle an object to a file using json.
+
+    Args:
+        file_name: the file to store the object to
+        obj: the object to store
+    """
     import jsonpickle.ext.numpy as jsonpickle_numpy
     import jsonpickle
 
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 8bc7a9abb17f..566cb5b2ac5b 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -12,18 +12,36 @@
 logger = logging.getLogger(__name__)
 
 
+# Mapping of feature name to a list of numpy arrays representing the actual features
+# For example:
+# "text_features" -> [
+#   "numpy array containing dense features for every training example",
+#   "numpy array containing sparse features for every training example"
+# ]
 Data = Dict[Text, List[np.ndarray]]
 
 
 class FeatureSignature(NamedTuple):
+    """Stores the shape and the type (sparse vs dense) of features."""
+
     is_sparse: bool
     shape: List[int]
 
 
 class RasaModelData:
+    """Data object used for all RasaModels. It contains all features needed to train
+    the models."""
+
     def __init__(
         self, label_key: Optional[Text] = None, data: Optional[Data] = None
     ) -> None:
+        """
+        Initializes the RasaModelData object.
+
+        Args:
+            label_key: the label_key used for balancing, etc.
+            data: the data holding the features
+        """
         self.data = data or {}
         self.label_key = label_key
         # will be updated when features are added
@@ -67,8 +85,7 @@ def is_empty(self) -> bool:
     def number_of_examples(self) -> int:
         """Obtain number of examples in data.
 
-        Raise a ValueError if number of examples differ for different data in
-        session data.
+        Raises: A ValueError if number of examples differ for different features.
         """
         if not self.data:
             return 0
@@ -102,10 +119,12 @@ def split(
         self._check_label_key()
 
         if self.label_key is None:
+            # randomly split data as no label key is split
             multi_values = [v for values in self.data.values() for v in values]
             solo_values = [[] for values in self.data.values() for v in values]
             stratify = None
         else:
+            # make sure that examples for each label value are in both split sets
             label_ids = self._create_label_ids(self.data[self.label_key][0])
             label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
 
@@ -352,7 +371,7 @@ def append_type(features: np.ndarray) -> None:
 
     def _gen_batch(
         self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
-    ) -> Generator[Tuple, None, None]:
+    ) -> Generator[Tuple[Optional[np.ndarray]], None, None]:
         """Generate batches."""
 
         data = self.data
@@ -411,9 +430,9 @@ def _split_by_label_ids(
 
         label_data = []
         for label_id in unique_label_ids:
-            ids = label_ids == label_id
+            matching_ids = label_ids == label_id
             label_data.append(
-                RasaModelData(self.label_key, self._data_for_ids(data, ids))
+                RasaModelData(self.label_key, self._data_for_ids(data, matching_ids))
             )
         return label_data
 
@@ -438,7 +457,7 @@ def _convert_train_test_split(
         # train datasets have an even index
         index = 0
         for key, values in self.data.items():
-            for _ in range(len(values)):
+            for _ in values:
                 data_train[key].append(
                     self._combine_features(output_values[index * 2], solo_values[index])
                 )
@@ -524,22 +543,25 @@ def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray:
     def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
         """Convert a scipy matrix into indices, data, and shape."""
 
+        # we need to make sure that the matrices are coo_matrices otherwise the
+        # transformation does not work (e.g. you cannot access x.row, x.col)
         if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix):
             array_of_sparse = [x.tocoo() for x in array_of_sparse]
 
         max_seq_len = max([x.shape[0] for x in array_of_sparse])
 
+        # get the indices of values
         indices = np.hstack(
             [
                 np.vstack([i * np.ones_like(x.row), x.row, x.col])
                 for i, x in enumerate(array_of_sparse)
             ]
         ).T
+
         data = np.hstack([x.data for x in array_of_sparse])
 
-        shape = np.array(
-            (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])
-        )
+        number_of_features = array_of_sparse[0].shape[-1]
+        shape = np.array((len(array_of_sparse), max_seq_len, number_of_features))
 
         return [
             indices.astype(np.int64),

From 1cb18d2ee59fe8b2bd8fd13e32df816a4ec2630a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 10:26:22 +0100
Subject: [PATCH 619/633] add docstring

---
 rasa/utils/tensorflow/models.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 1c7c4066e21c..48dafd731f21 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -18,7 +18,12 @@ class RasaModel(tf.keras.models.Model):
     Cannot be used as tf.keras.Model
     """
 
-    def __init__(self, random_seed: Optional[int] = None, **kwargs):
+    def __init__(self, random_seed: Optional[int] = None, **kwargs) -> None:
+        """Initialize the RasaModel.
+
+        Args:
+            random_seed: set the random seed to get reproducible results
+        """
         super().__init__(**kwargs)
 
         self.total_loss = tf.keras.metrics.Mean(name="t_loss")

From ccf25a4f7e9ce9e73375a50fc9b48da43514ae7e Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 10:40:24 +0100
Subject: [PATCH 620/633] add comments to model data

---
 rasa/utils/tensorflow/model_data.py | 46 +++++++++++++++++------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 4f6159d29b86..d552b7a44383 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -62,7 +62,7 @@ def number_of_examples(self) -> int:
         """Obtain number of examples in data.
 
         Raise a ValueError if number of examples differ for different data in
-        session data.
+        model data.
         """
         if not self.data:
             return 0
@@ -95,25 +95,30 @@ def split(
 
         self._check_label_key()
 
-        if self.label_key is None:
-            multi_values = [v for values in self.data.values() for v in values]
-            solo_values = [[] for values in self.data.values() for v in values]
-            stratify = None
-        else:
+        if self.label_key is not None:
             label_ids = self._create_label_ids(self.data[self.label_key][0])
             label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0)))
 
             self._check_train_test_sizes(number_of_test_examples, label_counts)
 
             counts = np.array([label_counts[label] for label in label_ids])
+            # we perform stratified train test split,
+            # which insures every label is present in the test data
+            # this operation can be performed only for labels
+            # that contain several data points
             multi_values = [
                 v[counts > 1] for values in self.data.values() for v in values
             ]
+            # collect data points that are unique for their label
             solo_values = [
                 v[counts == 1] for values in self.data.values() for v in values
             ]
 
             stratify = label_ids[counts > 1]
+        else:
+            multi_values = [v for values in self.data.values() for v in values]
+            solo_values = [[] for values in self.data.values() for v in values]
+            stratify = None
 
         output_values = train_test_split(
             *multi_values,
@@ -179,13 +184,13 @@ def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
         }
 
     def shuffled_data(self, data: Data) -> Data:
-        """Shuffle session data."""
+        """Shuffle model data."""
 
         ids = np.random.permutation(self.num_examples)
         return self._data_for_ids(data, ids)
 
     def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
-        """Mix session data to account for class imbalance.
+        """Mix model data to account for class imbalance.
 
         This batching strategy puts rare classes approximately in every other batch,
         by repeating them. Mimics stratified batching, but also takes into account
@@ -205,11 +210,15 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
         )
         num_label_ids = len(unique_label_ids)
 
+        # group data points by their label
         # need to call every time, so that the data is shuffled inside each class
-        label_data = self._split_by_label_ids(data, label_ids, unique_label_ids)
+        data_by_label = self._split_by_label_ids(data, label_ids, unique_label_ids)
 
+        # running index inside each data grouped by labels
         data_idx = [0] * num_label_ids
+        # number of cycles each label was passed
         num_data_cycles = [0] * num_label_ids
+        # if a label was skipped in current batch
         skipped = [False] * num_label_ids
 
         new_data = defaultdict(list)
@@ -231,7 +240,7 @@ def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
                     int(counts_label_ids[index] / self.num_examples * batch_size) + 1
                 )
 
-                for k, values in label_data[index].items():
+                for k, values in data_by_label[index].items():
                     for i, v in enumerate(values):
                         if len(new_data[k]) < i + 1:
                             new_data[k].append([])
@@ -261,7 +270,7 @@ def prepare_batch(
         end: Optional[int] = None,
         tuple_sizes: Optional[Dict[Text, int]] = None,
     ) -> Tuple[Optional[np.ndarray]]:
-        """Slices session data into batch using given start and end value."""
+        """Slices model data into batch using given start and end value."""
 
         if not data:
             data = self.data
@@ -292,12 +301,12 @@ def prepare_batch(
                 else:
                     batch_data.append(self._pad_dense_data(_data))
 
-        # len of batch_data is equal to the number of keys in session data
+        # len of batch_data is equal to the number of keys in model data
         return tuple(batch_data)
 
     def batch_tuple_sizes(self) -> Dict[Text, int]:
 
-        # save the amount of placeholders attributed to session data keys
+        # save the amount of placeholders attributed to model data keys
         tuple_sizes = defaultdict(int)
 
         idx = 0
@@ -328,7 +337,7 @@ def as_tf_dataset(
         )
 
     def _get_shapes_types(self) -> Tuple:
-        """Extract shapes and types from session data."""
+        """Extract shapes and types from model data."""
 
         types = []
         shapes = []
@@ -404,7 +413,7 @@ def _check_train_test_sizes(
 
     @staticmethod
     def _data_for_ids(data: Data, ids: np.ndarray) -> Data:
-        """Filter session data by ids."""
+        """Filter model data by ids."""
 
         new_data = defaultdict(list)
 
@@ -419,7 +428,7 @@ def _data_for_ids(data: Data, ids: np.ndarray) -> Data:
     def _split_by_label_ids(
         self, data: Data, label_ids: np.ndarray, unique_label_ids: np.ndarray
     ) -> List["RasaModelData"]:
-        """Reorganize session data into a list of session data with the same labels."""
+        """Reorganize model data into a list of model data with the same labels."""
 
         label_data = []
         for label_id in unique_label_ids:
@@ -438,14 +447,13 @@ def _check_label_key(self):
     def _convert_train_test_split(
         self, output_values: List[Any], solo_values: List[Any]
     ) -> Tuple["RasaModelData", "RasaModelData"]:
-        """Convert the output of sklearn.model_selection.train_test_split into train and
-        eval session data."""
+        """Converts the output of sklearn's train_test_split into model data."""
 
         data_train = defaultdict(list)
         data_val = defaultdict(list)
 
         # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc.
-        # order is kept, e.g. same order as session data keys
+        # order is kept, e.g. same order as model data keys
 
         # train datasets have an even index
         index = 0

From eb5cf6b30a73f8310536fa18c0258d3b64d0fbc9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 10:47:42 +0100
Subject: [PATCH 621/633] create tmp dir for convert

---
 rasa/nlu/classifiers/diet_classifier.py | 10 +++++-----
 rasa/utils/io.py                        |  6 ++++++
 rasa/utils/train_utils.py               |  6 +++---
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index fa59cac40755..00d9d535af11 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -445,16 +445,16 @@ def _create_label_data(
         """
 
         # Collect one example for each label
-        labels_idx_example = []
+        labels_idx_examples = []
         for label_name, idx in label_id_dict.items():
             label_example = self._find_example_for_label(
                 label_name, training_data.intent_examples, attribute
             )
-            labels_idx_example.append((idx, label_example))
+            labels_idx_examples.append((idx, label_example))
 
         # Sort the list of tuples based on label_idx
-        labels_idx_example = sorted(labels_idx_example, key=lambda x: x[0])
-        labels_example = [example for (_, example) in labels_idx_example]
+        labels_idx_examples = sorted(labels_idx_examples, key=lambda x: x[0])
+        labels_example = [example for (_, example) in labels_idx_examples]
 
         # Collect features, precomputed if they exist, else compute on the fly
         if self._check_labels_features_exist(labels_example, attribute):
@@ -467,7 +467,7 @@ def _create_label_data(
         label_data = RasaModelData()
         label_data.add_features(LABEL_FEATURES, features)
 
-        label_ids = np.array([idx for (idx, _) in labels_idx_example])
+        label_ids = np.array([idx for (idx, _) in labels_idx_examples])
         # explicitly add last dimension to label_ids
         # to track correctly dynamic sequences
         label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)])
diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index 705da529ba5c..946bb4487793 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -283,6 +283,12 @@ def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Te
     return f.name
 
 
+def create_temporary_directory() -> Text:
+    """Creates a tempfile.TemporaryDirectory."""
+    f = tempfile.TemporaryDirectory()
+    return f.name
+
+
 def create_path(file_path: Text) -> None:
     """Makes sure all directories in the 'file_path' exists."""
 
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 989737bf1986..62338631929f 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -5,8 +5,7 @@
 from rasa.core.constants import DIALOGUE
 from rasa.nlu.constants import TEXT
 from rasa.nlu.tokenizers.tokenizer import Token
-
-
+import rasa.utils.io as io_utils
 from rasa.utils.tensorflow.constants import (
     LABEL,
     HIDDEN_LAYERS_SIZES,
@@ -162,7 +161,8 @@ def load_tf_hub_model(model_url: Text) -> Any:
     try:
         return tfhub.load(model_url)
     except OSError:
-        os.environ["TFHUB_CACHE_DIR"] = "/tmp/tfhub"
+        directory = io_utils.create_temporary_directory()
+        os.environ["TFHUB_CACHE_DIR"] = directory
         return tfhub.load(model_url)
 
 

From 67dad7a511ba085c948a70d6aeff32844827b89d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 10:53:14 +0100
Subject: [PATCH 622/633] update type

---
 .../sparse_featurizer/lexical_syntactic_featurizer.py         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
index 9458345d0598..bf0ddb39e308 100644
--- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
+++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 import numpy as np
-from typing import Any, Dict, Optional, Text, List, Type
+from typing import Any, Dict, Optional, Text, List, Type, Union
 
 from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY
 from rasa.constants import DOCS_URL_COMPONENTS
@@ -249,7 +249,7 @@ def _get_feature_value(
         token_idx: int,
         pointer_position: int,
         token_length: int,
-    ) -> Any:
+    ) -> Union[bool, int, Text]:
         if feature == END_OF_SENTENCE:
             return token_idx + pointer_position == token_length - 1
 

From e0ab5f7ab6e308e0e9fe120766efd1361c18dc76 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 10:58:34 +0100
Subject: [PATCH 623/633] add comments

---
 rasa/utils/tensorflow/layers.py      | 2 +-
 rasa/utils/tensorflow/transformer.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index b25ad97b151f..55c0ddbe3a0f 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -67,7 +67,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
 
 
 class DenseWithSparseWeights(tf.keras.layers.Dense):
-    def __init__(self, sparsity: float = 0.8, **kwargs) -> None:
+    def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None:
         super().__init__(**kwargs)
         self.sparsity = sparsity
 
diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py
index 4cbdfcb0d75b..c7a309db1c38 100644
--- a/rasa/utils/tensorflow/transformer.py
+++ b/rasa/utils/tensorflow/transformer.py
@@ -43,16 +43,19 @@ def __init__(
 
         self._depth = units // self.num_heads
 
+        # process queries
         self._wq = DenseWithSparseWeights(
             units=units, use_bias=False, sparsity=sparsity
         )
+        # process keys
         self._wk = DenseWithSparseWeights(
             units=units, use_bias=False, sparsity=sparsity
         )
+        # process values
         self._wv = DenseWithSparseWeights(
             units=units, use_bias=False, sparsity=sparsity
         )
-
+        # process attention output
         self._dense = DenseWithSparseWeights(units=units, sparsity=sparsity)
 
         self._create_relative_embeddings()

From 4cf5fffe6d87ad8baab7fffcfc6b707f50908782 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 11:03:16 +0100
Subject: [PATCH 624/633] change comment

---
 rasa/nlu/classifiers/diet_classifier.py | 2 +-
 rasa/nlu/selectors/response_selector.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 00d9d535af11..0039770fb8f6 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -566,7 +566,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         )
 
         if not label_id_index_mapping:
-            # no responses present to train
+            # no labels are present to train
             return RasaModelData()
 
         self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index ed6705c2f70a..4fc79b60889b 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -255,7 +255,7 @@ def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
         )
 
         if not label_id_index_mapping:
-            # no responses present to train
+            # no labels are present to train
             return RasaModelData()
 
         self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)

From 4fefe7642dd895f8ca7154fc8739600f64be8142 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 11:17:10 +0100
Subject: [PATCH 625/633] recalculate number of examples after balancing

---
 rasa/utils/tensorflow/model_data.py | 37 +++++++++++++++++------------
 tests/utils/test_model_data.py      |  6 ++---
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 03bf0addf5ca..9ceb7f329303 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -44,7 +44,7 @@ def __init__(
         """
         self.data = data or {}
         self.label_key = label_key
-        # will be updated when features are added
+        # should be updated when features are added
         self.num_examples = self.number_of_examples()
 
     def get_only(self, key: Text) -> Optional[np.ndarray]:
@@ -82,21 +82,24 @@ def is_empty(self) -> bool:
         """Checks if data is set."""
         return not self.data
 
-    def number_of_examples(self) -> int:
+    def number_of_examples(self, data: Optional[Data] = None) -> int:
         """Obtain number of examples in data.
 
         Raises: A ValueError if number of examples differ for different features.
         """
 
-        if not self.data:
+        if not data:
+            data = self.data
+
+        if not data:
             return 0
 
-        example_lengths = [v.shape[0] for values in self.data.values() for v in values]
+        example_lengths = [v.shape[0] for values in data.values() for v in values]
 
         # check if number of examples is the same for all values
         if not all(length == example_lengths[0] for length in example_lengths):
             raise ValueError(
-                f"Number of examples differs for keys '{self.data.keys()}'. Number of "
+                f"Number of examples differs for keys '{data.keys()}'. Number of "
                 f"examples should be the same for all data."
             )
 
@@ -156,7 +159,10 @@ def split(
         return self._convert_train_test_split(output_values, solo_values)
 
     def add_features(self, key: Text, features: List[np.ndarray]):
-        """Add list of features to data under specified key."""
+        """Add list of features to data under specified key.
+
+        Should update number of examples.
+        """
 
         if not features:
             return
@@ -209,13 +215,13 @@ def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
             for key, values in self.data.items()
         }
 
-    def shuffled_data(self, data: Data) -> Data:
+    def _shuffled_data(self, data: Data) -> Data:
         """Shuffle model data."""
 
         ids = np.random.permutation(self.num_examples)
         return self._data_for_ids(data, ids)
 
-    def balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
+    def _balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
         """Mix model data to account for class imbalance.
 
         This batching strategy puts rare classes approximately in every other batch,
@@ -385,16 +391,17 @@ def _gen_batch(
         """Generate batches."""
 
         data = self.data
+        num_examples = self.num_examples
 
         if shuffle:
-            data = self.shuffled_data(data)
+            data = self._shuffled_data(data)
 
         if batch_strategy == BALANCED:
-            data = self.balanced_data(data, batch_size, shuffle)
+            data = self._balanced_data(data, batch_size, shuffle)
+            # after balancing, number of examples increased
+            num_examples = self.number_of_examples(data)
 
-        num_batches = self.num_examples // batch_size + int(
-            self.num_examples % batch_size > 0
-        )
+        num_batches = num_examples // batch_size + int(num_examples % batch_size > 0)
 
         for batch_num in range(num_batches):
             start = batch_num * batch_size
@@ -420,7 +427,7 @@ def _check_train_test_sizes(
             )
 
     @staticmethod
-    def _data_for_ids(data: Data, ids: np.ndarray) -> Data:
+    def _data_for_ids(data: Optional[Data], ids: np.ndarray) -> Data:
         """Filter model data by ids."""
 
         new_data = defaultdict(list)
@@ -434,7 +441,7 @@ def _data_for_ids(data: Data, ids: np.ndarray) -> Data:
         return new_data
 
     def _split_by_label_ids(
-        self, data: Data, label_ids: np.ndarray, unique_label_ids: np.ndarray
+        self, data: Optional[Data], label_ids: np.ndarray, unique_label_ids: np.ndarray
     ) -> List["RasaModelData"]:
         """Reorganize model data into a list of model data with the same labels."""
 
diff --git a/tests/utils/test_model_data.py b/tests/utils/test_model_data.py
index c34643dec5b0..627b37c23f1a 100644
--- a/tests/utils/test_model_data.py
+++ b/tests/utils/test_model_data.py
@@ -67,7 +67,7 @@ def test_shuffle_session_data(model_data: RasaModelData):
         np.array(list(before.values())) == np.array(list(model_data.values()))
     )
 
-    data = model_data.shuffled_data(model_data.data)
+    data = model_data._shuffled_data(model_data.data)
 
     # check that original data didn't change
     assert np.all(
@@ -170,7 +170,7 @@ def test_gen_batch(model_data: RasaModelData):
 
 
 def test_balance_model_data(model_data: RasaModelData):
-    data = model_data.balanced_data(model_data.data, 2, False)
+    data = model_data._balanced_data(model_data.data, 2, False)
 
     assert np.all(data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1]))
 
@@ -178,7 +178,7 @@ def test_balance_model_data(model_data: RasaModelData):
 def test_not_balance_model_data(model_data: RasaModelData):
     test_model_data = RasaModelData(label_key="tag_ids", data=model_data.data)
 
-    data = test_model_data.balanced_data(test_model_data.data, 2, False)
+    data = test_model_data._balanced_data(test_model_data.data, 2, False)
 
     assert np.all(data.get("tag_ids") == test_model_data.get("tag_ids"))
 

From 8443d5092963747941b3c196813fa83a4881162f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 11:22:23 +0100
Subject: [PATCH 626/633] reorganize methods in model_data

---
 rasa/utils/tensorflow/model_data.py | 271 ++++++++++++++--------------
 1 file changed, 138 insertions(+), 133 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 9ceb7f329303..0fbdba4e9d86 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -29,8 +29,10 @@ class FeatureSignature(NamedTuple):
 
 
 class RasaModelData:
-    """Data object used for all RasaModels. It contains all features needed to train
-    the models."""
+    """Data object used for all RasaModels.
+
+    It contains all features needed to train the models.
+    """
 
     def __init__(
         self, label_key: Optional[Text] = None, data: Optional[Data] = None
@@ -42,6 +44,7 @@ def __init__(
             label_key: the label_key used for balancing, etc.
             data: the data holding the features
         """
+
         self.data = data or {}
         self.label_key = label_key
         # should be updated when features are added
@@ -76,10 +79,12 @@ def first_data_example(self) -> Data:
 
     def feature_not_exist(self, key: Text) -> bool:
         """Check if feature key is present and features are available."""
+
         return key not in self.data or not self.data[key]
 
     def is_empty(self) -> bool:
         """Checks if data is set."""
+
         return not self.data
 
     def number_of_examples(self, data: Optional[Data] = None) -> int:
@@ -115,6 +120,46 @@ def feature_dimension(self, key: Text) -> int:
 
         return number_of_features
 
+    def add_features(self, key: Text, features: List[np.ndarray]):
+        """Add list of features to data under specified key.
+
+        Should update number of examples.
+        """
+
+        if not features:
+            return
+
+        if key in self.data:
+            raise ValueError(f"Key '{key}' already exists in RasaModelData.")
+
+        self.data[key] = []
+
+        for data in features:
+            if data.size > 0:
+                self.data[key].append(data)
+
+        if not self.data[key]:
+            del self.data[key]
+
+        # update number of examples
+        self.num_examples = self.number_of_examples()
+
+    def add_mask(self, key: Text, from_key: Text):
+        """Calculate mask for given key and put it under specified key."""
+
+        if not self.data.get(from_key):
+            return
+
+        self.data[key] = []
+
+        for data in self.data[from_key]:
+            if data.size > 0:
+                # explicitly add last dimension to mask
+                # to track correctly dynamic sequences
+                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
+                self.data[key].append(mask)
+                break
+
     def split(
         self, number_of_test_examples: int, random_seed: int
     ) -> Tuple["RasaModelData", "RasaModelData"]:
@@ -158,62 +203,112 @@ def split(
 
         return self._convert_train_test_split(output_values, solo_values)
 
-    def add_features(self, key: Text, features: List[np.ndarray]):
-        """Add list of features to data under specified key.
+    def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
+        """Get signature of RasaModelData.
 
-        Should update number of examples.
+        Signature stores the shape and whether features are sparse or not for every key.
         """
 
-        if not features:
-            return
+        return {
+            key: [
+                FeatureSignature(
+                    True if isinstance(v[0], scipy.sparse.spmatrix) else False,
+                    v[0].shape,
+                )
+                for v in values
+            ]
+            for key, values in self.data.items()
+        }
 
-        if key in self.data:
-            raise ValueError(f"Key '{key}' already exists in RasaModelData.")
+    def as_tf_dataset(
+        self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
+    ) -> tf.data.Dataset:
+        """Create tf dataset."""
 
-        self.data[key] = []
+        shapes, types = self._get_shapes_types()
 
-        for data in features:
-            if data.size > 0:
-                self.data[key].append(data)
+        return tf.data.Dataset.from_generator(
+            lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle),
+            output_types=types,
+            output_shapes=shapes,
+            args=([batch_size]),
+        )
 
-        if not self.data[key]:
-            del self.data[key]
+    def prepare_batch(
+        self,
+        data: Optional[Data] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        tuple_sizes: Optional[Dict[Text, int]] = None,
+    ) -> Tuple[Optional[np.ndarray]]:
+        """Slices model data into batch using given start and end value."""
 
-        # update number of examples
-        self.num_examples = self.number_of_examples()
+        if not data:
+            data = self.data
 
-    def add_mask(self, key: Text, from_key: Text):
-        """Calculate mask for given key and put it under specified key."""
+        batch_data = []
 
-        if not self.data.get(from_key):
-            return
+        for key, values in data.items():
+            # add None for not present values during processing
+            if not values:
+                if tuple_sizes:
+                    batch_data += [None] * tuple_sizes[key]
+                else:
+                    batch_data.append(None)
+                continue
 
-        self.data[key] = []
+            for v in values:
+                if start is not None and end is not None:
+                    _data = v[start:end]
+                elif start is not None:
+                    _data = v[start:]
+                elif end is not None:
+                    _data = v[:end]
+                else:
+                    _data = v[:]
 
-        for data in self.data[from_key]:
-            if data.size > 0:
-                # explicitly add last dimension to mask
-                # to track correctly dynamic sequences
-                mask = np.array([np.ones((x.shape[0], 1)) for x in data])
-                self.data[key].append(mask)
-                break
+                if isinstance(_data[0], scipy.sparse.spmatrix):
+                    batch_data.extend(self._scipy_matrix_to_values(_data))
+                else:
+                    batch_data.append(self._pad_dense_data(_data))
 
-    def get_signature(self) -> Dict[Text, List[FeatureSignature]]:
-        """Get signature of RasaModelData.
+        # len of batch_data is equal to the number of keys in model data
+        return tuple(batch_data)
 
-        Signature stores the shape and whether features are sparse or not for every
-        key."""
+    def _get_shapes_types(self) -> Tuple:
+        """Extract shapes and types from model data."""
 
-        return {
-            key: [
-                FeatureSignature(
-                    True if isinstance(v[0], scipy.sparse.spmatrix) else False,
-                    v[0].shape,
-                )
-                for v in values
-            ]
-            for key, values in self.data.items()
-        }
+        types = []
+        shapes = []
+
+        def append_shape(features: np.ndarray) -> None:
+            if isinstance(features[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                shapes.append((None, features[0].ndim + 1))
+                shapes.append((None,))
+                shapes.append((features[0].ndim + 1))
+            elif features[0].ndim == 0:
+                shapes.append((None,))
+            elif features[0].ndim == 1:
+                shapes.append((None, features[0].shape[-1]))
+            else:
+                shapes.append((None, None, features[0].shape[-1]))
+
+        def append_type(features: np.ndarray) -> None:
+            if isinstance(features[0], scipy.sparse.spmatrix):
+                # scipy matrix is converted into indices, data, shape
+                types.append(tf.int64)
+                types.append(tf.float32)
+                types.append(tf.int64)
+            else:
+                types.append(tf.float32)
+
+        for values in self.data.values():
+            for v in values:
+                append_shape(v)
+                append_type(v)
+
+        return tuple(shapes), tuple(types)
 
     def _shuffled_data(self, data: Data) -> Data:
         """Shuffle model data."""
@@ -295,96 +390,6 @@ def _balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data:
 
         return final_data
 
-    def prepare_batch(
-        self,
-        data: Optional[Data] = None,
-        start: Optional[int] = None,
-        end: Optional[int] = None,
-        tuple_sizes: Optional[Dict[Text, int]] = None,
-    ) -> Tuple[Optional[np.ndarray]]:
-        """Slices model data into batch using given start and end value."""
-
-        if not data:
-            data = self.data
-
-        batch_data = []
-
-        for key, values in data.items():
-            # add None for not present values during processing
-            if not values:
-                if tuple_sizes:
-                    batch_data += [None] * tuple_sizes[key]
-                else:
-                    batch_data.append(None)
-                continue
-
-            for v in values:
-                if start is not None and end is not None:
-                    _data = v[start:end]
-                elif start is not None:
-                    _data = v[start:]
-                elif end is not None:
-                    _data = v[:end]
-                else:
-                    _data = v[:]
-
-                if isinstance(_data[0], scipy.sparse.spmatrix):
-                    batch_data.extend(self._scipy_matrix_to_values(_data))
-                else:
-                    batch_data.append(self._pad_dense_data(_data))
-
-        # len of batch_data is equal to the number of keys in model data
-        return tuple(batch_data)
-
-    def as_tf_dataset(
-        self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
-    ) -> tf.data.Dataset:
-        """Create tf dataset."""
-
-        shapes, types = self._get_shapes_types()
-
-        return tf.data.Dataset.from_generator(
-            lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle),
-            output_types=types,
-            output_shapes=shapes,
-            args=([batch_size]),
-        )
-
-    def _get_shapes_types(self) -> Tuple:
-        """Extract shapes and types from model data."""
-
-        types = []
-        shapes = []
-
-        def append_shape(features: np.ndarray) -> None:
-            if isinstance(features[0], scipy.sparse.spmatrix):
-                # scipy matrix is converted into indices, data, shape
-                shapes.append((None, features[0].ndim + 1))
-                shapes.append((None,))
-                shapes.append((features[0].ndim + 1))
-            elif features[0].ndim == 0:
-                shapes.append((None,))
-            elif features[0].ndim == 1:
-                shapes.append((None, features[0].shape[-1]))
-            else:
-                shapes.append((None, None, features[0].shape[-1]))
-
-        def append_type(features: np.ndarray) -> None:
-            if isinstance(features[0], scipy.sparse.spmatrix):
-                # scipy matrix is converted into indices, data, shape
-                types.append(tf.int64)
-                types.append(tf.float32)
-                types.append(tf.int64)
-            else:
-                types.append(tf.float32)
-
-        for values in self.data.values():
-            for v in values:
-                append_shape(v)
-                append_type(v)
-
-        return tuple(shapes), tuple(types)
-
     def _gen_batch(
         self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False
     ) -> Generator[Tuple[Optional[np.ndarray]], None, None]:

From 1c5f9da9ce65ad997d0d68273122f9d620310591 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 11:31:07 +0100
Subject: [PATCH 627/633] remove num_neg check from ted

---
 rasa/core/policies/ted_policy.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 41963b40315c..3a78c873f393 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -286,14 +286,6 @@ def train(
 
         self._label_data = self._create_label_data(domain)
 
-        # check if number of negatives is less than number of label_ids
-        if self.config[NUM_NEG] < domain.num_actions:
-            logger.debug(
-                f"Set '{NUM_NEG}' to the number of actions - 1, e.g. "
-                f"{domain.num_actions - 1}."
-            )
-        self.config[NUM_NEG] = min(self.config[NUM_NEG], domain.num_actions - 1)
-
         # extract actual training data to feed to model
         model_data = self._create_model_data(training_data.X, training_data.y)
         if model_data.is_empty():

From e06caaf9d7b9014f47fb72eb2677ec782756e94c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 11:54:34 +0100
Subject: [PATCH 628/633] update requirements

---
 alt_requirements/requirements_pretrained_embeddings_convert.txt | 2 +-
 requirements.txt                                                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt
index a7bc4d785bf8..ebc7906fc1d4 100644
--- a/alt_requirements/requirements_pretrained_embeddings_convert.txt
+++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt
@@ -1,5 +1,5 @@
 # Minimum Install Requirements
 -r ../requirements.txt
 
-tensorflow_text>=2.1.0rc0
+tensorflow_text>=2.1.1
 tensorflow_hub==0.7.0
diff --git a/requirements.txt b/requirements.txt
index 81104797bba3..a0dcee7b536c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,4 +60,4 @@ PyJWT==1.7.1
 python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
-tensorflow-addons==0.7.0
+tensorflow-addons>=0.7.0

From 285467475ef3d157f097a0d0981a61fa48f64f72 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 26 Feb 2020 11:59:56 +0100
Subject: [PATCH 629/633] fix nlu comparison test

---
 rasa/nlu/test.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index f0331769108c..4a449f9e9b9c 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -40,7 +40,13 @@
 
 logger = logging.getLogger(__name__)
 
-ENTITY_PROCESSORS = {"EntitySynonymMapper"}
+# Exclude 'EmbeddingIntentClassifier' and 'ResponseSelector' as their super class
+# performs entity extraction but those two classifiers don't
+ENTITY_PROCESSORS = {
+    "EntitySynonymMapper",
+    "EmbeddingIntentClassifier",
+    "ResponseSelector",
+}
 
 CVEvaluationResult = namedtuple("Results", "train test")
 
@@ -1449,8 +1455,6 @@ def compare_nlu(
         train, test = data.train_test_split()
         write_to_file(test_path, test.nlu_as_markdown())
 
-        training_examples_per_run = []
-
         for percentage in exclusion_percentages:
             percent_string = f"{percentage}%_exclusion"
 

From 554ad4c5fb2cf8cfd258877c9c522c434d69e94d Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 12:05:59 +0100
Subject: [PATCH 630/633] update requirements

---
 alt_requirements/requirements_pretrained_embeddings_convert.txt | 2 +-
 requirements.txt                                                | 2 +-
 setup.py                                                        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt
index ebc7906fc1d4..a7bc4d785bf8 100644
--- a/alt_requirements/requirements_pretrained_embeddings_convert.txt
+++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt
@@ -1,5 +1,5 @@
 # Minimum Install Requirements
 -r ../requirements.txt
 
-tensorflow_text>=2.1.1
+tensorflow_text>=2.1.0rc0
 tensorflow_hub==0.7.0
diff --git a/requirements.txt b/requirements.txt
index a0dcee7b536c..6233aee22f95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,4 +60,4 @@ PyJWT==1.7.1
 python-dateutil==2.8.0
 # for new featurizers
 tensorflow==2.1.0
-tensorflow-addons>=0.7.0
+tensorflow-addons==0.8.2
diff --git a/setup.py b/setup.py
index 7ba38f9875ea..bb703444796d 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@
     "numpy~=1.16",
     "scipy~=1.2",
     "tensorflow~=2.1",
+    "tensorflow-addons~=0.8",
     # absl is a tensorflow dependency, but produces double logging before 0.8
     # should be removed once tensorflow requires absl > 0.8 on its own
     "absl-py>=0.8.0",
@@ -84,7 +85,6 @@
     "SQLAlchemy~=1.3.0",
     "sklearn-crfsuite~=0.3.6",
     "PyJWT~=1.7",
-    "tensorflow-addons==0.7.0",
 ]
 
 extras_requires = {

From 727ed612128f3f71b1ee95c053868b70c5125830 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 26 Feb 2020 13:19:40 +0100
Subject: [PATCH 631/633] update version

---
 rasa/constants.py | 2 +-
 rasa/version.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/constants.py b/rasa/constants.py
index f53f308c1c94..2c4c61fd6394 100644
--- a/rasa/constants.py
+++ b/rasa/constants.py
@@ -49,7 +49,7 @@
 CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"]
 CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU
 
-MINIMUM_COMPATIBLE_VERSION = "1.6.0a2"
+MINIMUM_COMPATIBLE_VERSION = "1.8.0a1"
 
 GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml")
 
diff --git a/rasa/version.py b/rasa/version.py
index 8015df84ebfc..e317dc37ca9b 100644
--- a/rasa/version.py
+++ b/rasa/version.py
@@ -1,3 +1,3 @@
 # this file will automatically be changed,
 # do not add anything but the version number here!
-__version__ = "1.7.4"
+__version__ = "1.8.0a1"

From c33e4088ba720ed2f81ea50010ce3c941fb919d5 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Wed, 26 Feb 2020 13:51:08 +0100
Subject: [PATCH 632/633] Update
 alt_requirements/requirements_pretrained_embeddings_convert.txt

Co-Authored-By: Tobias Wochinger <t.wochinger@rasa.com>
---
 alt_requirements/requirements_pretrained_embeddings_convert.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt
index a7bc4d785bf8..7a96d5bba9e3 100644
--- a/alt_requirements/requirements_pretrained_embeddings_convert.txt
+++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt
@@ -1,5 +1,5 @@
 # Minimum Install Requirements
 -r ../requirements.txt
 
-tensorflow_text>=2.1.0rc0
+tensorflow_text==2.1.0rc0
 tensorflow_hub==0.7.0

From 19569767a6ea07204e4d54fe1b7a3b164bc3957f Mon Sep 17 00:00:00 2001
From: Alexander Khizov <a.khizov@rasa.com>
Date: Wed, 26 Feb 2020 15:01:06 +0100
Subject: [PATCH 633/633] Fixed an issue with AWS persistor

- It's possible now to specify region name for AWS S3
- Tests are setting mock region name now
---
 rasa/nlu/persistor.py       | 22 +++++++++++++++++-----
 tests/nlu/test_persistor.py |  8 +++++---
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/rasa/nlu/persistor.py b/rasa/nlu/persistor.py
index 43fdefc7178d..6d572655892d 100644
--- a/rasa/nlu/persistor.py
+++ b/rasa/nlu/persistor.py
@@ -113,12 +113,19 @@ class AWSPersistor(Persistor):
 
     Fetches them when needed, instead of storing them on the local disk."""
 
-    def __init__(self, bucket_name: Text, endpoint_url: Optional[Text] = None) -> None:
+    def __init__(
+        self,
+        bucket_name: Text,
+        endpoint_url: Optional[Text] = None,
+        region_name: Optional[Text] = None,
+    ) -> None:
         import boto3
 
         super().__init__()
-        self.s3 = boto3.resource("s3", endpoint_url=endpoint_url)
-        self._ensure_bucket_exists(bucket_name)
+        self.s3 = boto3.resource(
+            "s3", endpoint_url=endpoint_url, region_name=region_name
+        )
+        self._ensure_bucket_exists(bucket_name, region_name)
         self.bucket_name = bucket_name
         self.bucket = self.s3.Bucket(bucket_name)
 
@@ -132,11 +139,16 @@ def list_models(self) -> List[Text]:
             logger.warning(f"Failed to list models in AWS. {e}")
             return []
 
-    def _ensure_bucket_exists(self, bucket_name: Text) -> None:
+    def _ensure_bucket_exists(
+        self, bucket_name: Text, region_name: Optional[Text] = None
+    ) -> None:
         import boto3
         import botocore
 
-        bucket_config = {"LocationConstraint": boto3.DEFAULT_SESSION.region_name}
+        if not region_name:
+            region_name = boto3.DEFAULT_SESSION.region_name
+
+        bucket_config = {"LocationConstraint": region_name}
         # noinspection PyUnresolvedReferences
         try:
             self.s3.create_bucket(
diff --git a/tests/nlu/test_persistor.py b/tests/nlu/test_persistor.py
index 47300b61e858..3dba8e43b617 100644
--- a/tests/nlu/test_persistor.py
+++ b/tests/nlu/test_persistor.py
@@ -41,7 +41,7 @@ async def test_list_method_method_in_AWS_persistor(component_builder, tmpdir):
 def test_list_models_method_raise_exeception_in_AWS_persistor():
     os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
 
-    awspersistor = persistor.AWSPersistor("rasa-test")
+    awspersistor = persistor.AWSPersistor("rasa-test", region_name="foo")
     result = awspersistor.list_models()
 
     assert result == []
@@ -54,7 +54,9 @@ def test_retrieve_tar_archive_with_s3_namespace():
     destination = "dst"
     with patch.object(persistor.AWSPersistor, "_decompress") as decompress:
         with patch.object(persistor.AWSPersistor, "_retrieve_tar") as retrieve:
-            persistor.AWSPersistor("rasa-test").retrieve(model, destination)
+            persistor.AWSPersistor("rasa-test", region_name="foo").retrieve(
+                model, destination
+            )
         decompress.assert_called_once_with("model.tar.gz", destination)
         retrieve.assert_called_once_with(model)
 
@@ -65,7 +67,7 @@ def test_s3_private_retrieve_tar():
     # Ensure the S3 persistor writes to a filename `model.tar.gz`, whilst
     # passing the fully namespaced path to boto3
     model = "/my/s3/project/model.tar.gz"
-    awsPersistor = persistor.AWSPersistor("rasa-test")
+    awsPersistor = persistor.AWSPersistor("rasa-test", region_name="foo")
     with patch.object(awsPersistor.bucket, "download_fileobj") as download_fileobj:
         # noinspection PyProtectedMember
         awsPersistor._retrieve_tar(model)