diff --git a/.gitattributes b/.gitattributes index 7fe70d7..39f6eb7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ *.json filter=lfs diff=lfs merge=lfs -text +*.tsv filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 0c15734..52b9bd8 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,33 @@ A [setup.py](./setup.py) file is provided in order to simplify the installation ```Python pip list | grep mtdnn ``` +> For Mixed Precision and Distributed Training, please install NVIDIA apex by following instructions [here](https://github.com/NVIDIA/apex#linux) + +## Run an example +An example Jupyter [notebook](./examples/classification/tc_mnli.ipynb) is provided to show a runnable example using the MNLI dataset. The notebook reads and loads the MNLI data provided for your convenience [here](./sample_data). This dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. + +> **NOTE:** The MNLI data is very large and would need [Git LFS](https://docs.github.com/en/github/managing-large-files/installing-git-large-file-storage) installed on your machine to pull it down. + +## How To Use + -## How To Use 1. Create a model configuration object, `MTDNNConfig`, with the necessary parameters to initialize the MT-DNN model. Initialization without any parameters will default to a similar configuration that initializes a BERT model. This configuration object can be initialized with training and learning parameters like `batch_size` and `learning_rate`. Please consult the class implementation for all parameters. ```Python BATCH_SIZE = 16 - config = MTDNNConfig(batch_size=BATCH_SIZE) + MULTI_GPU_ON = True + MAX_SEQ_LEN = 128 + NUM_EPOCHS = 5 + config = MTDNNConfig(batch_size=BATCH_SIZE, + max_seq_len=MAX_SEQ_LEN, + multi_gpu_on=MULTI_GPU_ON) ``` -1. Define the task parameters to train for and initialize an `MTDNNTaskDefs` object. +1. Define the task parameters to train for and initialize an `MTDNNTaskDefs` object. The definition can be a single task or multiple tasks to train. `MTDNNTaskDefs` can take a Python dict, YAML or JSON file with the task definition(s). ```Python + DATA_DIR = "../../sample_data/" + DATA_SOURCE_DIR = os.path.join(DATA_DIR, "MNLI") tasks_params = { "mnli": { "data_format": "PremiseAndOneHypothesis", @@ -73,27 +88,52 @@ A [setup.py](./setup.py) file is provided in order to simplify the installation "n_class": 3, "split_names": [ "train", - "matched_dev", - "mismatched_dev", - "matched_test", - "mismatched_test", + "dev_matched", + "dev_mismatched", + "test_matched", + "test_mismatched", ], + "data_source_dir": DATA_SOURCE_DIR, + "data_process_opts": {"header": True, "is_train": True, "multi_snli": False,}, "task_type": "Classification", }, } + + # Define the tasks task_defs = MTDNNTaskDefs(tasks_params) ``` +1. Create a data tokenizing object, `MTDNNTokenizer`. Based on the model's initial checkpoint, it wraps around the model's Huggingface transformers library to encode the data to **MT-DNN** format. This becomes the input to the data building stage. + + ``` + tokenizer = MTDNNTokenizer(do_lower_case=True) + + # Testing out the tokenizer + print(tokenizer.encode("What NLP toolkit do you recommend", "MT-DNN is a fantastic toolkit")) + + # ([101, 2054, 17953, 2361, 6994, 23615, 2079, 2017, 16755, 102, 11047, 1011, 1040, 10695, 2003, 1037, 10392, 6994, 23615, 102], None, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + ``` + +1. 
Create a data preprocessing object, `MTDNNDataBuilder`. This class is responsible for converting the data into the MT-DNN format depending on the task. This object is responsible for creating the vectorized data for each task. + + ``` + ## Load and build data + data_builder = MTDNNDataBuilder(tokenizer=tokenizer, + task_defs=task_defs, + data_dir=DATA_SOURCE_DIR, + canonical_data_suffix="canonical_data", + dump_rows=True) + + ## Build data to MTDNN Format as an iterable of each specific task + vectorized_data = data_builder.vectorize() + ``` + 1. Create a data preprocessing object, `MTDNNDataProcess`. This creates the training, test and development PyTorch dataloaders needed for training and testing. We also need to retrieve the necessary training options required to initialize the model correctly, for all tasks. ```Python - data_processor = MTDNNDataProcess( - config=config, - task_defs=task_defs, - data_dir="/home/useradmin/sources/mt-dnn/data/canonical_data/bert_uncased_lower", - train_datasets_list=["mnli"], - test_datasets_list=["mnli_mismatched", "mnli_matched"], - ) + data_processor = MTDNNDataProcess(config=config, + task_defs=task_defs, + vectorized_data=vectorized_data) # Retrieve the multi task train, dev and test dataloaders multitask_train_dataloader = data_processor.get_train_dataloader() @@ -131,8 +171,7 @@ A [setup.py](./setup.py) file is provided in order to simplify the installation 1. At this point the MT-DNN model allows us to fit to the model and create predictions. The fit takes an optional `epochs` parameter that overwrites the epochs set in the `MTDNNConfig` object. ```Python - model.fit() - model.predict() + model.fit(epochs=NUM_EPOCHS) ``` @@ -141,7 +180,7 @@ Optionally using a previously trained model as checkpoint. ```Python # Predict using a PyTorch model checkpoint - checkpt = "./model_0.pt" + checkpt = "./checkpoint/model_4.pt" model.predict(trained_model_chckpt=checkpt) ``` diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..c419263 --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman \ No newline at end of file diff --git a/ci/component_governance.yml b/ci/component_governance.yml new file mode 100644 index 0000000..7b71052 --- /dev/null +++ b/ci/component_governance.yml @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# Pull request against these branches will trigger this build +pr: + - master + +# no CI trigger +trigger: none + +jobs: +- job: Component_governance + timeoutInMinutes: 20 # how long to run the job before automatically cancelling + pool: + vmImage: 'ubuntu-16.04' + + steps: + - bash: | + python scripts/generate_requirements_txt.py + displayName: 'Generate requirements.txt file from generate_conda_file.py' + + - task: ComponentGovernanceComponentDetection@0 + inputs: + scanType: 'Register' + verbosity: 'Verbose' + alertWarningLevel: 'High' diff --git a/examples/classification/tc_mnli.ipynb b/examples/classification/tc_mnli.ipynb index 30d8c55..e14fa77 100644 --- a/examples/classification/tc_mnli.ipynb +++ b/examples/classification/tc_mnli.ipynb @@ -4,72 +4,134 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Copyright (c) Microsoft Corporation. All rights reserved.\n", - "### Licensed under the MIT License." + "*Copyright (c) Microsoft Corporation. 
All rights reserved.*\n", "\n", "*Licensed under the MIT License.*\n", "\n", "# The Microsoft Toolkit of Multi-Task Deep Neural Networks for Natural Language Understanding\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "MT-DNN is an open-source natural language understanding (NLU) toolkit that makes it easy for researchers and developers to train customized deep learning models. Built upon PyTorch and Transformers, MT-DNN is designed to facilitate rapid\n", "customization for a broad spectrum of NLU tasks, using a variety of objectives (classification, regression, structured prediction) and text encoders (e.g., RNNs, BERT, RoBERTa, UniLM). A unique feature of MT-DNN is its built-in support for robust and transferable learning using the adversarial multi-task learning paradigm. To enable efficient production deployment, MT-DNN supports multitask knowledge distillation, which can substantially compress a deep neural model without significant performance drop. We demonstrate the effectiveness of MT-DNN on a wide range of NLU applications across general and biomedical domains. The pip installable package and pretrained models will be publicly available at https://github.com/microsoft/mt-dnn." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Design" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Multi-Task Deep Neural Networks for Natural Language Understanding \n", + "MT-DNN is designed for modularity, flexibility, and ease of use. These modules are built upon PyTorch (Paszke et al., 2019) and Transformers (Wolf\n", "et al., 2019), allowing the use of the SOTA pretrained models, e.g., BERT (Devlin et al., 2019), RoBERTa (Liu et al., 2019c) and UniLM (Dong\n", "et al., 2019). The unique attribute of this package is a flexible interface for adversarial multi-task fine-tuning and knowledge distillation, so that researchers and developers can build large SOTA NLU models and then compress them to small ones\n", "for online deployment. The overall workflow and system architecture are shown in the two figures below.\n", "\n", "\n", - "This PyTorch package implements the Multi-Task Deep Neural Networks (MT-DNN) for Natural Language Understanding. " + "![Workflow Design](https://nlpbp.blob.core.windows.net/images/mt-dnn2.JPG)\n", "\n", "The above figure shows the workflow of MT-DNN: train a neural language model on a large amount of unlabeled raw text\n", "to obtain general contextual representations; then finetune the learned contextual representation on downstream tasks, e.g. GLUE (Wang et al., 2018); lastly, distill this large model to a lighter one for online deployment. In the latter two phases, we can leverage powerful multi-task learning and adversarial training to further improve performance." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Architecture" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### The data \n", + "![overall_arch](https://nlpbp.blob.core.windows.net/images/mt-dnn.png)\n", "The figure above shows the overall system architecture. The lower layers are shared across all tasks while the top layers are task-specific. The input X (either a sentence or a set of sentences) is first represented as a sequence of embedding\n", "vectors, one for each word, in l1. 
Then the encoder, e.g., a Transformer or recurrent neural network (LSTM) model,\n", "captures the contextual information for each word and generates the shared contextual embedding vectors in l2.\n", "Finally, for each task, additional task-specific layers generate task-specific representations, followed by operations\n", "necessary for classification, similarity scoring, or relevance ranking. In the case of adversarial training, we perturb\n", "embeddings from the lexicon encoder and then add an extra loss term during training. Note that the\n", "inference phase does not require perturbations." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Introduction\n", "In this notebook, we fine-tune and evaluate MT-DNN models on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset. \n", "\n", "### Running Time\n", "\n", "This is a __computationally intensive__ notebook that runs on the entire MNLI dataset, using the matched and mismatched datasets for training, development and test. \n", "\n", - "This notebook assumes you have data already pre-processed in the MT-DNN format and accessible in a local directory. \n", "\n", + "The table below provides some reference running time on a GPU machine. \n", "\n", + "|Dataset|MULTI_GPU_ON|Machine Configurations|Running time|\n", + "|:------|:---------|:----------------------|:------------|\n", + "|MultiNLI|True|4 NVIDIA Tesla K80 GPUs, 24GB GPU memory| ~ 20 hours |\n", "\n", - "For the purposes of this example we have added sample data that is already processed in MT-DNN format which can be found in the __sample_data__ folder. " + "If you run into `CUDA out-of-memory error` or the Jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_SEQ_LEN` in `MTDNNConfig`, but note that model performance may be compromised.\n" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], "source": [ - "%load_ext autoreload" + "\n", + "### Text Classification of MultiNLI Sentences using MT-DNN\n", + "\n", + "This notebook utilizes the pip installable package that implements the Multi-Task Deep Neural Network Toolkit (MTDNN) for Natural Language Understanding. It's recommended to run this notebook on GPU machines as it's very computationally intensive."
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ + "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ + "import json\n", + "import os\n", + "import shutil\n", + "import sys\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "import pandas as pd\n", "import torch\n", "\n", "from mtdnn.common.types import EncoderModelType\n", "from mtdnn.configuration_mtdnn import MTDNNConfig\n", + "from mtdnn.data_builder_mtdnn import MTDNNDataBuilder\n", "from mtdnn.modeling_mtdnn import MTDNNModel\n", "from mtdnn.process_mtdnn import MTDNNDataProcess\n", - "from mtdnn.tasks.config import MTDNNTaskDefs" + "from mtdnn.tasks.config import MTDNNTaskDefs\n", + "from mtdnn.tokenizer_mtdnn import MTDNNTokenizer" ] }, { @@ -81,12 +143,93 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "DATA_DIR = \"../../sample_data/bert_uncased_lower/mnli/\"\n", - "BATCH_SIZE = 16" + "# Define Configuration, Tasks and Model Objects\n", + "ROOT_DIR = TemporaryDirectory().name\n", + "OUTPUT_DIR = os.path.join(ROOT_DIR, 'checkpoint')\n", + "os.makedirs(OUTPUT_DIR) if not os.path.exists(OUTPUT_DIR) else OUTPUT_DIR\n", + "\n", + "LOG_DIR = os.path.join(ROOT_DIR, 'tensorboard_logdir')\n", + "os.makedirs(LOG_DIR) if not os.path.exists(LOG_DIR) else LOG_DIR\n", + "\n", + "DATA_DIR = \"../../sample_data/\"\n", + "DATA_SOURCE_DIR = os.path.join(DATA_DIR, \"MNLI\")\n", + "\n", + "# Training parameters\n", + "BATCH_SIZE = 16\n", + "MULTI_GPU_ON = True\n", + "MAX_SEQ_LEN = 128\n", + "NUM_EPOCHS = 5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exploring the location for our data to be downloaded, model to be checkpointed and logs to be dumped" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp/tmpd9ok4aeo/checkpoint\n", + "tmp/tmpd9ok4aeo/tensorboard_logdir\n" + ] + } + ], + "source": [ + "print(OUTPUT_DIR)\n", + "print(LOG_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Dataset\n", + "We start by loading a subset of the data. The MNLI dataset has been downloaded for your convenience and added as part of this repository. The data can be found in `../../sample_data/MNLI`. \n", + "\n", + "The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. \n", + "\n", + "> **NOTE:** The MNLI data is very large and would need [Git LFS](https://docs.github.com/en/github/managing-large-files/installing-git-large-file-storage) installed on your machine to pull it down. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['canonical_data',\n", + " 'README.txt',\n", + " 'train.tsv',\n", + " 'dev_matched.tsv',\n", + " 'diagnostic.tsv',\n", + " 'test_mismatched.tsv',\n", + " 'diagnostic-full.tsv',\n", + " 'dev_mismatched.tsv',\n", + " 'test_matched.tsv']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "assert os.path.exists(DATA_SOURCE_DIR), \"[ERROR] - The MNLI Dataset cannot be found.\"\n", + "os.listdir(DATA_SOURCE_DIR)" ] }, { @@ -95,16 +238,18 @@ "source": [ "### Define a Configuration Object \n", "\n", - "Create a model configuration object, `MTDNNConfig`, with the necessary parameters to initialize the MT-DNN model. Initialization without any parameters will default to a similar configuration that initializes a BERT model. \n" + "Create a model configuration object, `MTDNNConfig`, with the necessary parameters to initialize the MT-DNN model. Initialization without any parameters will default to a similar configuration that initializes a BERT model. " ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "config = MTDNNConfig(batch_size=BATCH_SIZE)" + "config = MTDNNConfig(batch_size=BATCH_SIZE, \n", + " max_seq_len=MAX_SEQ_LEN, \n", + " multi_gpu_on=MULTI_GPU_ON)" ] }, { @@ -114,45 +259,51 @@ "\n", "### Create Task Definition Object \n", "\n", - "Define the task parameters to train for and initialize an `MTDNNTaskDefs` object. Create a task parameter dictionary. Definition can be a single or multiple tasks to train. `MTDNNTaskDefs` can take a python dict, yaml or json file with task(s) defintion." + "Define the task parameters to train for and initialize an `MTDNNTaskDefs` object. Create a task parameter dictionary. Definition can be a single or multiple tasks to train. `MTDNNTaskDefs` can take a python dict, yaml or json file with task(s) defintion.\n", + "\n", + "The data source directory is the path of data downloaded and extracted above using `download_tsv_files_and_extract` which is the `MNLI` dir under the `DATA_DIR` temporary directory. 
\n", + "\n", + "The data source has options that are set to drive each task pre-processing; `data_process_opts`\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 30, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "INFO - Mapping Task attributes\n", - "INFO - Configured task definitions - ['mnli']\n" + "07/01/2020 04:42:22 - mtdnn.tasks.config - INFO - Mapping Task attributes\n", + "07/01/2020 04:42:22 - mtdnn.tasks.config - INFO - Configured task definitions - ['mnli']\n" ] } ], "source": [ "tasks_params = {\n", - " \"mnli\": {\n", - " \"data_format\": \"PremiseAndOneHypothesis\",\n", - " \"encoder_type\": \"BERT\",\n", - " \"dropout_p\": 0.3,\n", - " \"enable_san\": True,\n", - " \"labels\": [\"contradiction\", \"neutral\", \"entailment\"],\n", - " \"metric_meta\": [\"ACC\"],\n", - " \"loss\": \"CeCriterion\",\n", - " \"kd_loss\": \"MseCriterion\",\n", - " \"n_class\": 3,\n", - " \"split_names\": [\n", - " \"train\",\n", - " \"matched_dev\",\n", - " \"mismatched_dev\",\n", - " \"matched_test\",\n", - " \"mismatched_test\",\n", - " ],\n", - " \"task_type\": \"Classification\",\n", - " },\n", - " }\n", + " \"mnli\": {\n", + " \"data_format\": \"PremiseAndOneHypothesis\",\n", + " \"encoder_type\": \"BERT\",\n", + " \"dropout_p\": 0.3,\n", + " \"enable_san\": True,\n", + " \"labels\": [\"contradiction\", \"neutral\", \"entailment\"],\n", + " \"metric_meta\": [\"ACC\"],\n", + " \"loss\": \"CeCriterion\",\n", + " \"kd_loss\": \"MseCriterion\",\n", + " \"n_class\": 3,\n", + " \"split_names\": [\n", + " \"train\",\n", + " \"dev_matched\",\n", + " \"dev_mismatched\",\n", + " \"test_matched\",\n", + " \"test_mismatched\",\n", + " ],\n", + " \"data_source_dir\": DATA_SOURCE_DIR,\n", + " \"data_process_opts\": {\"header\": True, \"is_train\": True, \"multi_snli\": False,},\n", + " \"task_type\": \"Classification\",\n", + " },\n", + "}\n", "\n", "# Define the tasks\n", "task_defs = MTDNNTaskDefs(tasks_params)" @@ -163,59 +314,284 @@ "metadata": {}, "source": [ "\n", - "### Create the Data Processing Object \n", - "\n", - "Create a data preprocessing object, `MTDNNDataProcess`. This creates the training, test and development PyTorch dataloaders needed for training and testing. We also need to retrieve the necessary training options required to initialize the model correctly, for all tasks. \n", + "### Create the MTDNN Data Tokenizer Object \n", "\n", - "Define a data process that handles creating the training, test and development PyTorch dataloaders" + "Create a data tokenizing object, `MTDNNTokenizer`. Based on the model initial checkpoint, it wraps around the model's Huggingface transformers library to encode the data to MT-DNN format. This becomes the input to the data building stage. 
\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = MTDNNTokenizer(do_lower_case=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Testing out the Tokenizer encode function on a sample text\n", + "`tokenizer.encode(\"What NLP toolkit do you recommend\", \"MT-DNN is a fantastic toolkit\")`" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "([101, 2054, 17953, 2361, 6994, 23615, 2079, 2017, 16755, 102, 11047, 1011, 1040, 10695, 2003, 1037, 10392, 6994, 23615, 102], None, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])\n" + ] + } + ], + "source": [ + "print(tokenizer.encode(\"What NLP toolkit do you recommend\", \"MT-DNN is a fantastic toolkit\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "markdown", "metadata": {}, + "source": [ + "### Create the Data Builder Object \n", + "\n", + "Create a data preprocessing object, `MTDNNDataBuilder`. This class is responsible for converting the data into the MT-DNN format depending on the task. \n", + " \n", + "\n", + "Define a data builder that handles the creating of each task's vectorized data utilizing the model tokenizer. This will build out the vectorized data needed for creating the training, test and development PyTorch dataloaders" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": true + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/30/2020 07:00:07 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 392702 samples for mnli at ../../sample_data/MNLI/canonical_data/mnli_train.tsv\n", + "06/30/2020 07:00:08 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9815 samples for mnli at ../../sample_data/MNLI/canonical_data/mnli_dev_matched.tsv\n", + "06/30/2020 07:00:08 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9832 samples for mnli at ../../sample_data/MNLI/canonical_data/mnli_dev_mismatched.tsv\n", + "06/30/2020 07:00:08 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9796 samples for mnli at ../../sample_data/MNLI/canonical_data/mnli_test_matched.tsv\n", + "06/30/2020 07:00:08 - mtdnn.data_builder_mtdnn - INFO - Sucessfully loaded and built 9847 samples for mnli at ../../sample_data/MNLI/canonical_data/mnli_test_mismatched.tsv\n", + "mnli_train\n", + "06/30/2020 07:00:08 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TRAIN' Task\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Building Data For Premise and One Hypothesis: 392702it [05:08, 1272.59it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/30/2020 07:05:18 - mtdnn.data_builder_mtdnn - INFO - Saving data to ../../sample_data/MNLI/canonical_data/bert_base_uncased/mnli_train.json\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO - Starting to process the training data sets\n", - "INFO - Loading ../../sample_data/bert_uncased_lower/mnli/mnli_train.json as task 0\n" + "\n", + "Saving Data For PremiseAndOneHypothesis: 100%|██████████| 392702/392702 [00:05<00:00, 68365.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Loaded 392702 samples out of 392702\n" + "mnli_dev_matched\n", + "06/30/2020 07:05:24 
- mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI DEV MATCHED' Task\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO - Starting to process the testing data sets\n" + "\n", + "Building Data For Premise and One Hypothesis: 9815it [00:07, 1277.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Loaded 9832 samples out of 9832\n", - "Loaded 9847 samples out of 9847\n", - "Loaded 9815 samples out of 9815\n", - "Loaded 9796 samples out of 9796\n" + "06/30/2020 07:05:32 - mtdnn.data_builder_mtdnn - INFO - Saving data to ../../sample_data/MNLI/canonical_data/bert_base_uncased/mnli_dev_matched.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9815/9815 [00:00<00:00, 62891.81it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mnli_dev_mismatched\n", + "06/30/2020 07:05:32 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI DEV MISMATCHED' Task\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Building Data For Premise and One Hypothesis: 9832it [00:07, 1250.76it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/30/2020 07:05:40 - mtdnn.data_builder_mtdnn - INFO - Saving data to ../../sample_data/MNLI/canonical_data/bert_base_uncased/mnli_dev_mismatched.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9832/9832 [00:00<00:00, 69022.28it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mnli_test_matched\n", + "06/30/2020 07:05:40 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TEST MATCHED' Task\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Building Data For Premise and One Hypothesis: 9796it [00:07, 1300.80it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/30/2020 07:05:47 - mtdnn.data_builder_mtdnn - INFO - Saving data to ../../sample_data/MNLI/canonical_data/bert_base_uncased/mnli_test_matched.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9796/9796 [00:00<00:00, 71259.18it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mnli_test_mismatched\n", + "06/30/2020 07:05:47 - mtdnn.data_builder_mtdnn - INFO - Building Data For 'MNLI TEST MISMATCHED' Task\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Building Data For Premise and One Hypothesis: 9847it [00:07, 1242.78it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/30/2020 07:05:55 - mtdnn.data_builder_mtdnn - INFO - Saving data to ../../sample_data/MNLI/canonical_data/bert_base_uncased/mnli_test_mismatched.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Saving Data For PremiseAndOneHypothesis: 100%|██████████| 9847/9847 [00:00<00:00, 69755.60it/s]\n" + ] + } + ], + "source": [ + "## Load and build data\n", + "data_builder = MTDNNDataBuilder(\n", + " tokenizer=tokenizer,\n", + " task_defs=task_defs,\n", + " data_dir=DATA_SOURCE_DIR,\n", + " canonical_data_suffix=\"canonical_data\",\n", + " dump_rows=True,\n", + ")\n", + "\n", + "## Build data to MTDNN Format\n", + "## Iterable of each specific task and processed data\n", + "vectorized_data = 
data_builder.vectorize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the Data Processing Object \n", + "\n", + "Create a data preprocessing object, `MTDNNDataProcess`. This creates the training, test and development PyTorch dataloaders needed for training and testing. We also need to retrieve the necessary training options required to initialize the model correctly, for all tasks. \n", + "\n", + "Define a data process that handles creating the training, test and development PyTorch dataloaders" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Starting to process the training data sets\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Loading mnli_train as task 0\n", + "07/01/2020 04:42:36 - mtdnn.dataset_mtdnn - INFO - Loaded 391533 samples out of 392702\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Starting to process the testing data sets\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Loading mnli_dev_matched as task 0\n", + "07/01/2020 04:42:36 - mtdnn.dataset_mtdnn - INFO - Loaded 9815 samples out of 9815\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Loading mnli_dev_mismatched as task 0\n", + "07/01/2020 04:42:36 - mtdnn.dataset_mtdnn - INFO - Loaded 9832 samples out of 9832\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Loading mnli_test_matched as task 0\n", + "07/01/2020 04:42:36 - mtdnn.dataset_mtdnn - INFO - Loaded 9796 samples out of 9796\n", + "07/01/2020 04:42:36 - mtdnn.process_mtdnn - INFO - Loading mnli_test_mismatched as task 0\n", + "07/01/2020 04:42:36 - mtdnn.dataset_mtdnn - INFO - Loaded 9847 samples out of 9847\n" ] } ], "source": [ "# Make the Data Preprocess step and update the config with training data updates\n", "data_processor = MTDNNDataProcess(\n", - " config=config,\n", - " task_defs=task_defs,\n", - " data_dir=DATA_DIR,\n", - " train_datasets_list=[\"mnli\"],\n", - " test_datasets_list=[\"mnli_mismatched\", \"mnli_matched\"],\n", + " config=config, task_defs=task_defs, vectorized_data=vectorized_data\n", ")" ] }, @@ -228,7 +604,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -241,12 +617,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Get training options to initialize model" + "Now we can retrieve the training options, from the processor, to initialize model with." 
] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -267,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -285,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -311,6 +687,8 @@ " multitask_train_dataloader=multitask_train_dataloader,\n", " dev_dataloaders_list=dev_dataloaders_list,\n", " test_dataloaders_list=test_dataloaders_list,\n", + " output_dir=OUTPUT_DIR,\n", + " log_dir=LOG_DIR \n", ")" ] }, @@ -318,40 +696,461 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Fit on one epoch and predict using the training and test \n", + "## Model Finetuning, Prediction and Evaluation\n", + "\n", + "### Fit and finetune model on five epochs and predict using the training and test \n", "\n", "At this point the MT-DNN model allows us to fit to the model and create predictions. The fit takes an optional `epochs` parameter that overwrites the epochs set in the `MTDNNConfig` object. " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - Total number of params: 109484547\n", + "06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - At epoch 0\n", + "06/26/2020 08:14:07 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\n", + "06/26/2020 08:14:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 1] Training Loss - [1.63923] Time Remaining - [1 day, 13:33:33]\n", + "06/26/2020 08:19:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 500] Training Loss - [1.32204] Time Remaining - [4:25:55]\n", + "06/26/2020 08:25:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 1000] Training Loss - [1.21343] Time Remaining - [4:18:55]\n", + "06/26/2020 08:30:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 1500] Training Loss - [1.16369] Time Remaining - [4:13:52]\n", + "06/26/2020 08:36:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 2000] Training Loss - [1.12522] Time Remaining - [4:08:36]\n", + "06/26/2020 08:41:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 2500] Training Loss - [1.07541] Time Remaining - [4:03:18]\n", + "06/26/2020 08:47:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 3000] Training Loss - [1.03195] Time Remaining - [3:58:03]\n", + "06/26/2020 08:52:55 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 3500] Training Loss - [0.99050] Time Remaining - [3:52:26]\n", + "06/26/2020 08:58:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 4000] Training Loss - [0.95599] Time Remaining - [3:46:59]\n", + "06/26/2020 09:04:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 4500] Training Loss - [0.92721] Time Remaining - [3:41:34]\n", + "06/26/2020 09:09:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 5000] Training Loss - [0.90235] Time Remaining - [3:36:07]\n", + "06/26/2020 09:15:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 5500] Training Loss - [0.87961] Time Remaining - [3:30:33]\n", + "06/26/2020 09:20:41 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 6000] Training Loss - [0.85982] Time Remaining - [3:24:55]\n", + "06/26/2020 09:26:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 6500] Training Loss - [0.84107] Time Remaining - 
[3:19:24]\n", + "06/26/2020 09:31:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 7000] Training Loss - [0.82505] Time Remaining - [3:13:51]\n", + "06/26/2020 09:37:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 7500] Training Loss - [0.81009] Time Remaining - [3:08:25]\n", + "06/26/2020 09:42:55 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 8000] Training Loss - [0.79706] Time Remaining - [3:02:49]\n", + "06/26/2020 09:48:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 8500] Training Loss - [0.78522] Time Remaining - [2:57:13]\n", + "06/26/2020 09:54:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 9000] Training Loss - [0.77296] Time Remaining - [2:51:42]\n", + "06/26/2020 09:59:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 9500] Training Loss - [0.76185] Time Remaining - [2:46:11]\n", + "06/26/2020 10:05:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 10000] Training Loss - [0.75168] Time Remaining - [2:40:42]\n", + "06/26/2020 10:10:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 10500] Training Loss - [0.74186] Time Remaining - [2:35:11]\n", + "06/26/2020 10:16:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 11000] Training Loss - [0.73347] Time Remaining - [2:29:37]\n", + "06/26/2020 10:21:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 11500] Training Loss - [0.72535] Time Remaining - [2:24:03]\n", + "06/26/2020 10:27:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 12000] Training Loss - [0.71798] Time Remaining - [2:18:30]\n", + "06/26/2020 10:32:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 12500] Training Loss - [0.71132] Time Remaining - [2:12:56]\n", + "06/26/2020 10:38:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 13000] Training Loss - [0.70462] Time Remaining - [2:07:23]\n", + "06/26/2020 10:44:02 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 13500] Training Loss - [0.69882] Time Remaining - [2:01:49]\n", + "06/26/2020 10:49:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 14000] Training Loss - [0.69229] Time Remaining - [1:56:16]\n", + "06/26/2020 10:55:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 14500] Training Loss - [0.68647] Time Remaining - [1:50:43]\n", + "06/26/2020 11:00:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 15000] Training Loss - [0.68061] Time Remaining - [1:45:10]\n", + "06/26/2020 11:06:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 15500] Training Loss - [0.67555] Time Remaining - [1:39:39]\n", + "06/26/2020 11:11:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 16000] Training Loss - [0.67038] Time Remaining - [1:34:05]\n", + "06/26/2020 11:17:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 16500] Training Loss - [0.66557] Time Remaining - [1:28:32]\n", + "06/26/2020 11:22:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 17000] Training Loss - [0.66106] Time Remaining - [1:22:57]\n", + "06/26/2020 11:28:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 17500] Training Loss - [0.65651] Time Remaining - [1:17:24]\n", + "06/26/2020 11:34:01 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 18000] Training Loss - [0.65221] Time Remaining - [1:11:51]\n", + "06/26/2020 11:39:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 18500] Training Loss - [0.64808] Time Remaining - [1:06:17]\n", + "06/26/2020 11:45:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 19000] Training Loss - [0.64444] 
Time Remaining - [1:00:44]\n", + "06/26/2020 11:50:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 19500] Training Loss - [0.64039] Time Remaining - [0:55:11]\n", + "06/26/2020 11:56:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 20000] Training Loss - [0.63708] Time Remaining - [0:49:38]\n", + "06/27/2020 12:01:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 20500] Training Loss - [0.63337] Time Remaining - [0:44:05]\n", + "06/27/2020 12:07:19 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 21000] Training Loss - [0.62972] Time Remaining - [0:38:32]\n", + "06/27/2020 12:12:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 21500] Training Loss - [0.62656] Time Remaining - [0:32:59]\n", + "06/27/2020 12:18:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 22000] Training Loss - [0.62311] Time Remaining - [0:27:26]\n", + "06/27/2020 12:23:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 22500] Training Loss - [0.62002] Time Remaining - [0:21:53]\n", + "06/27/2020 12:29:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 23000] Training Loss - [0.61681] Time Remaining - [0:16:20]\n", + "06/27/2020 12:35:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 23500] Training Loss - [0.61411] Time Remaining - [0:10:46]\n", + "06/27/2020 12:40:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 24000] Training Loss - [0.61127] Time Remaining - [0:05:13]\n", + "06/27/2020 12:45:48 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_0.pt\n", + "06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_0.pt\n", + "06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - At epoch 1\n", + "06/27/2020 12:45:50 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\n", + "06/27/2020 12:46:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 24500] Training Loss - [0.60860] Time Remaining - [4:31:07]\n", + "06/27/2020 12:51:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 25000] Training Loss - [0.60618] Time Remaining - [4:27:29]\n", + "06/27/2020 12:57:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 25500] Training Loss - [0.60383] Time Remaining - [4:20:36]\n", + "06/27/2020 01:02:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 26000] Training Loss - [0.60122] Time Remaining - [4:15:02]\n", + "06/27/2020 01:08:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 26500] Training Loss - [0.59883] Time Remaining - [4:09:14]\n", + "06/27/2020 01:13:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 27000] Training Loss - [0.59667] Time Remaining - [4:03:36]\n", + "06/27/2020 01:19:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 27500] Training Loss - [0.59434] Time Remaining - [3:58:02]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/27/2020 01:25:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 28000] Training Loss - [0.59204] Time Remaining - [3:52:28]\n", + "06/27/2020 01:30:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 28500] Training Loss - [0.58952] Time Remaining - [3:46:57]\n", + "06/27/2020 01:36:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 29000] Training Loss - [0.58707] Time Remaining - [3:41:27]\n", + "06/27/2020 01:41:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 29500] Training Loss - [0.58480] Time Remaining - [3:35:47]\n", + "06/27/2020 01:47:11 - mtdnn.modeling_mtdnn - INFO 
- Task - [ 0] Updates - [ 30000] Training Loss - [0.58238] Time Remaining - [3:30:10]\n", + "06/27/2020 01:52:43 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 30500] Training Loss - [0.57984] Time Remaining - [3:24:34]\n", + "06/27/2020 01:58:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 31000] Training Loss - [0.57737] Time Remaining - [3:19:04]\n", + "06/27/2020 02:03:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 31500] Training Loss - [0.57507] Time Remaining - [3:13:25]\n", + "06/27/2020 02:09:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 32000] Training Loss - [0.57277] Time Remaining - [3:07:56]\n", + "06/27/2020 02:14:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 32500] Training Loss - [0.57034] Time Remaining - [3:02:20]\n", + "06/27/2020 02:20:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 33000] Training Loss - [0.56793] Time Remaining - [2:56:42]\n", + "06/27/2020 02:25:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 33500] Training Loss - [0.56548] Time Remaining - [2:51:11]\n", + "06/27/2020 02:31:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 34000] Training Loss - [0.56309] Time Remaining - [2:45:41]\n", + "06/27/2020 02:37:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 34500] Training Loss - [0.56059] Time Remaining - [2:40:11]\n", + "06/27/2020 02:42:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 35000] Training Loss - [0.55799] Time Remaining - [2:34:41]\n", + "06/27/2020 02:48:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 35500] Training Loss - [0.55566] Time Remaining - [2:29:07]\n", + "06/27/2020 02:53:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 36000] Training Loss - [0.55331] Time Remaining - [2:23:34]\n", + "06/27/2020 02:59:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 36500] Training Loss - [0.55091] Time Remaining - [2:18:02]\n", + "06/27/2020 03:04:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 37000] Training Loss - [0.54856] Time Remaining - [2:12:29]\n", + "06/27/2020 03:10:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 37500] Training Loss - [0.54628] Time Remaining - [2:06:55]\n", + "06/27/2020 03:15:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 38000] Training Loss - [0.54413] Time Remaining - [2:01:21]\n", + "06/27/2020 03:21:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 38500] Training Loss - [0.54178] Time Remaining - [1:55:49]\n", + "06/27/2020 03:27:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 39000] Training Loss - [0.53955] Time Remaining - [1:50:16]\n", + "06/27/2020 03:32:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 39500] Training Loss - [0.53732] Time Remaining - [1:44:42]\n", + "06/27/2020 03:38:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 40000] Training Loss - [0.53530] Time Remaining - [1:39:11]\n", + "06/27/2020 03:43:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 40500] Training Loss - [0.53318] Time Remaining - [1:33:38]\n", + "06/27/2020 03:49:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 41000] Training Loss - [0.53105] Time Remaining - [1:28:05]\n", + "06/27/2020 03:54:41 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 41500] Training Loss - [0.52908] Time Remaining - [1:22:32]\n", + "06/27/2020 04:00:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 42000] Training Loss - [0.52711] Time Remaining - [1:16:59]\n", + "06/27/2020 04:05:48 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 42500] Training Loss - [0.52516] Time Remaining - [1:11:26]\n", + "06/27/2020 04:11:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 43000] Training Loss - [0.52324] Time Remaining - [1:05:53]\n", + "06/27/2020 04:16:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 43500] Training Loss - [0.52161] Time Remaining - [1:00:20]\n", + "06/27/2020 04:22:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 44000] Training Loss - [0.51970] Time Remaining - [0:54:48]\n", + "06/27/2020 04:27:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 44500] Training Loss - [0.51821] Time Remaining - [0:49:14]\n", + "06/27/2020 04:33:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 45000] Training Loss - [0.51635] Time Remaining - [0:43:42]\n", + "06/27/2020 04:39:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 45500] Training Loss - [0.51451] Time Remaining - [0:38:09]\n", + "06/27/2020 04:44:33 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 46000] Training Loss - [0.51286] Time Remaining - [0:32:37]\n", + "06/27/2020 04:50:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 46500] Training Loss - [0.51112] Time Remaining - [0:27:04]\n", + "06/27/2020 04:55:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 47000] Training Loss - [0.50952] Time Remaining - [0:21:31]\n", + "06/27/2020 05:01:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 47500] Training Loss - [0.50789] Time Remaining - [0:15:59]\n", + "06/27/2020 05:06:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 48000] Training Loss - [0.50631] Time Remaining - [0:10:26]\n", + "06/27/2020 05:12:12 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 48500] Training Loss - [0.50469] Time Remaining - [0:04:53]\n", + "06/27/2020 05:17:06 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_1.pt\n", + "06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_1.pt\n", + "06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - At epoch 2\n", + "06/27/2020 05:17:07 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\n", + "06/27/2020 05:17:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 49000] Training Loss - [0.50317] Time Remaining - [4:33:15]\n", + "06/27/2020 05:23:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 49500] Training Loss - [0.50171] Time Remaining - [4:26:45]\n", + "06/27/2020 05:28:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 50000] Training Loss - [0.50034] Time Remaining - [4:20:18]\n", + "06/27/2020 05:34:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 50500] Training Loss - [0.49876] Time Remaining - [4:14:39]\n", + "06/27/2020 05:39:58 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 51000] Training Loss - [0.49731] Time Remaining - [4:08:48]\n", + "06/27/2020 05:45:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 51500] Training Loss - [0.49601] Time Remaining - [4:03:18]\n", + "06/27/2020 05:51:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 52000] Training Loss - [0.49468] Time Remaining - [3:57:54]\n", + "06/27/2020 05:56:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 52500] Training Loss - [0.49328] Time Remaining - [3:52:27]\n", + "06/27/2020 06:02:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 53000] Training Loss - [0.49179] Time Remaining - [3:46:53]\n", + "06/27/2020 06:07:48 - 
mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 53500] Training Loss - [0.49036] Time Remaining - [3:41:24]\n", + "06/27/2020 06:13:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 54000] Training Loss - [0.48902] Time Remaining - [3:35:48]\n", + "06/27/2020 06:18:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 54500] Training Loss - [0.48761] Time Remaining - [3:30:10]\n", + "06/27/2020 06:24:25 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 55000] Training Loss - [0.48609] Time Remaining - [3:24:32]\n", + "06/27/2020 06:29:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 55500] Training Loss - [0.48458] Time Remaining - [3:19:02]\n", + "06/27/2020 06:35:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 56000] Training Loss - [0.48321] Time Remaining - [3:13:27]\n", + "06/27/2020 06:41:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 56500] Training Loss - [0.48176] Time Remaining - [3:07:58]\n", + "06/27/2020 06:46:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 57000] Training Loss - [0.48029] Time Remaining - [3:02:19]\n", + "06/27/2020 06:52:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 57500] Training Loss - [0.47890] Time Remaining - [2:56:41]\n", + "06/27/2020 06:57:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 58000] Training Loss - [0.47741] Time Remaining - [2:51:11]\n", + "06/27/2020 07:03:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 58500] Training Loss - [0.47591] Time Remaining - [2:45:38]\n", + "06/27/2020 07:08:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 59000] Training Loss - [0.47436] Time Remaining - [2:40:06]\n", + "06/27/2020 07:14:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 59500] Training Loss - [0.47282] Time Remaining - [2:34:34]\n", + "06/27/2020 07:19:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 60000] Training Loss - [0.47137] Time Remaining - [2:29:01]\n", + "06/27/2020 07:25:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 60500] Training Loss - [0.46989] Time Remaining - [2:23:27]\n", + "06/27/2020 07:31:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 61000] Training Loss - [0.46844] Time Remaining - [2:17:54]\n", + "06/27/2020 07:36:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 61500] Training Loss - [0.46691] Time Remaining - [2:12:20]\n", + "06/27/2020 07:42:12 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 62000] Training Loss - [0.46547] Time Remaining - [2:06:48]\n", + "06/27/2020 07:47:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 62500] Training Loss - [0.46406] Time Remaining - [2:01:14]\n", + "06/27/2020 07:53:17 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 63000] Training Loss - [0.46261] Time Remaining - [1:55:40]\n", + "06/27/2020 07:58:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 63500] Training Loss - [0.46117] Time Remaining - [1:50:06]\n", + "06/27/2020 08:04:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 64000] Training Loss - [0.45977] Time Remaining - [1:44:33]\n", + "06/27/2020 08:10:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 64500] Training Loss - [0.45842] Time Remaining - [1:39:02]\n", + "06/27/2020 08:15:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 65000] Training Loss - [0.45711] Time Remaining - [1:33:28]\n", + "06/27/2020 08:21:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 65500] Training Loss - [0.45574] Time Remaining - [1:27:54]\n", + 
"06/27/2020 08:26:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 66000] Training Loss - [0.45438] Time Remaining - [1:22:20]\n", + "06/27/2020 08:32:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 66500] Training Loss - [0.45316] Time Remaining - [1:16:47]\n", + "06/27/2020 08:37:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 67000] Training Loss - [0.45187] Time Remaining - [1:11:14]\n", + "06/27/2020 08:43:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 67500] Training Loss - [0.45054] Time Remaining - [1:05:40]\n", + "06/27/2020 08:48:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 68000] Training Loss - [0.44935] Time Remaining - [1:00:07]\n", + "06/27/2020 08:54:20 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 68500] Training Loss - [0.44811] Time Remaining - [0:54:33]\n", + "06/27/2020 08:59:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 69000] Training Loss - [0.44706] Time Remaining - [0:49:00]\n", + "06/27/2020 09:05:25 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 69500] Training Loss - [0.44582] Time Remaining - [0:43:27]\n", + "06/27/2020 09:10:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 70000] Training Loss - [0.44460] Time Remaining - [0:37:54]\n", + "06/27/2020 09:16:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 70500] Training Loss - [0.44340] Time Remaining - [0:32:21]\n", + "06/27/2020 09:22:04 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 71000] Training Loss - [0.44224] Time Remaining - [0:26:47]\n", + "06/27/2020 09:27:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 71500] Training Loss - [0.44109] Time Remaining - [0:21:14]\n", + "06/27/2020 09:33:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 72000] Training Loss - [0.43992] Time Remaining - [0:15:41]\n", + "06/27/2020 09:38:43 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 72500] Training Loss - [0.43884] Time Remaining - [0:10:08]\n", + "06/27/2020 09:44:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 73000] Training Loss - [0.43772] Time Remaining - [0:04:35]\n", + "06/27/2020 09:48:52 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_2.pt\n", + "06/27/2020 09:48:53 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_2.pt\n", + "06/27/2020 09:48:53 - mtdnn.modeling_mtdnn - INFO - At epoch 3\n", + "06/27/2020 09:48:53 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\n", + "06/27/2020 09:49:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 73500] Training Loss - [0.43667] Time Remaining - [4:31:14]\n", + "06/27/2020 09:55:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 74000] Training Loss - [0.43569] Time Remaining - [4:24:58]\n", + "06/27/2020 10:00:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 74500] Training Loss - [0.43456] Time Remaining - [4:18:35]\n", + "06/27/2020 10:06:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 75000] Training Loss - [0.43348] Time Remaining - [4:13:25]\n", + "06/27/2020 10:12:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 75500] Training Loss - [0.43240] Time Remaining - [4:07:56]\n", + "06/27/2020 10:17:31 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 76000] Training Loss - [0.43145] Time Remaining - [4:02:12]\n", + "06/27/2020 10:23:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 76500] Training Loss - [0.43042] Time Remaining - [3:56:40]\n", + "06/27/2020 
10:28:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 77000] Training Loss - [0.42942] Time Remaining - [3:51:12]\n", + "06/27/2020 10:34:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 77500] Training Loss - [0.42829] Time Remaining - [3:45:45]\n", + "06/27/2020 10:39:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 78000] Training Loss - [0.42727] Time Remaining - [3:40:23]\n", + "06/27/2020 10:45:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 78500] Training Loss - [0.42634] Time Remaining - [3:34:42]\n", + "06/27/2020 10:50:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 79000] Training Loss - [0.42530] Time Remaining - [3:29:08]\n", + "06/27/2020 10:56:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 79500] Training Loss - [0.42421] Time Remaining - [3:23:36]\n", + "06/27/2020 11:01:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 80000] Training Loss - [0.42316] Time Remaining - [3:18:07]\n", + "06/27/2020 11:07:23 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 80500] Training Loss - [0.42214] Time Remaining - [3:12:31]\n", + "06/27/2020 11:12:56 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 81000] Training Loss - [0.42110] Time Remaining - [3:07:02]\n", + "06/27/2020 11:18:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 81500] Training Loss - [0.42001] Time Remaining - [3:01:29]\n", + "06/27/2020 11:23:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 82000] Training Loss - [0.41902] Time Remaining - [2:55:54]\n", + "06/27/2020 11:29:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 82500] Training Loss - [0.41800] Time Remaining - [2:50:20]\n", + "06/27/2020 11:35:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 83000] Training Loss - [0.41688] Time Remaining - [2:44:49]\n", + "06/27/2020 11:40:36 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 83500] Training Loss - [0.41583] Time Remaining - [2:39:18]\n", + "06/27/2020 11:46:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 84000] Training Loss - [0.41472] Time Remaining - [2:33:45]\n", + "06/27/2020 11:51:38 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 84500] Training Loss - [0.41364] Time Remaining - [2:28:10]\n", + "06/27/2020 11:57:10 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 85000] Training Loss - [0.41259] Time Remaining - [2:22:38]\n", + "06/27/2020 12:02:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 85500] Training Loss - [0.41152] Time Remaining - [2:17:06]\n", + "06/27/2020 12:08:14 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 86000] Training Loss - [0.41049] Time Remaining - [2:11:34]\n", + "06/27/2020 12:13:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 86500] Training Loss - [0.40944] Time Remaining - [2:05:59]\n", + "06/27/2020 12:19:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 87000] Training Loss - [0.40839] Time Remaining - [2:00:27]\n", + "06/27/2020 12:24:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 87500] Training Loss - [0.40739] Time Remaining - [1:54:55]\n", + "06/27/2020 12:30:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 88000] Training Loss - [0.40638] Time Remaining - [1:49:24]\n", + "06/27/2020 12:35:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 88500] Training Loss - [0.40539] Time Remaining - [1:43:51]\n", + "06/27/2020 12:41:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 89000] Training Loss - [0.40443] Time Remaining - 
[1:38:21]\n", + "06/27/2020 12:47:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 89500] Training Loss - [0.40348] Time Remaining - [1:32:49]\n", + "06/27/2020 12:52:33 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 90000] Training Loss - [0.40242] Time Remaining - [1:27:17]\n", + "06/27/2020 12:58:02 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 90500] Training Loss - [0.40147] Time Remaining - [1:21:44]\n", + "06/27/2020 01:03:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 91000] Training Loss - [0.40057] Time Remaining - [1:16:12]\n", + "06/27/2020 01:09:05 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 91500] Training Loss - [0.39961] Time Remaining - [1:10:39]\n", + "06/27/2020 01:14:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 92000] Training Loss - [0.39874] Time Remaining - [1:05:07]\n", + "06/27/2020 01:20:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 92500] Training Loss - [0.39783] Time Remaining - [0:59:34]\n", + "06/27/2020 01:25:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 93000] Training Loss - [0.39692] Time Remaining - [0:54:02]\n", + "06/27/2020 01:31:09 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 93500] Training Loss - [0.39617] Time Remaining - [0:48:30]\n", + "06/27/2020 01:36:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 94000] Training Loss - [0.39524] Time Remaining - [0:42:58]\n", + "06/27/2020 01:42:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 94500] Training Loss - [0.39436] Time Remaining - [0:37:26]\n", + "06/27/2020 01:47:46 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 95000] Training Loss - [0.39353] Time Remaining - [0:31:54]\n", + "06/27/2020 01:53:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 95500] Training Loss - [0.39261] Time Remaining - [0:26:22]\n", + "06/27/2020 01:58:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 96000] Training Loss - [0.39182] Time Remaining - [0:20:50]\n", + "06/27/2020 02:04:20 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 96500] Training Loss - [0.39099] Time Remaining - [0:15:18]\n", + "06/27/2020 02:09:51 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 97000] Training Loss - [0.39020] Time Remaining - [0:09:46]\n", + "06/27/2020 02:15:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 97500] Training Loss - [0.38940] Time Remaining - [0:04:14]\n", + "06/27/2020 02:19:37 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_3.pt\n", + "06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_3.pt\n", + "06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - At epoch 4\n", + "06/27/2020 02:19:38 - mtdnn.modeling_mtdnn - INFO - Amount of data to go over: 24471\n", + "06/27/2020 02:20:57 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 98000] Training Loss - [0.38866] Time Remaining - [4:36:08]\n", + "06/27/2020 02:26:30 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 98500] Training Loss - [0.38793] Time Remaining - [4:26:03]\n", + "06/27/2020 02:32:03 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 99000] Training Loss - [0.38710] Time Remaining - [4:20:01]\n", + "06/27/2020 02:37:37 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [ 99500] Training Loss - [0.38627] Time Remaining - [4:14:15]\n", + "06/27/2020 02:43:11 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [100000] Training Loss - [0.38549] Time Remaining - 
[4:08:45]\n", + "06/27/2020 02:48:44 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [100500] Training Loss - [0.38482] Time Remaining - [4:03:04]\n", + "06/27/2020 02:54:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [101000] Training Loss - [0.38410] Time Remaining - [3:57:37]\n", + "06/27/2020 02:59:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [101500] Training Loss - [0.38333] Time Remaining - [3:52:06]\n", + "06/27/2020 03:05:27 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [102000] Training Loss - [0.38250] Time Remaining - [3:46:33]\n", + "06/27/2020 03:11:01 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [102500] Training Loss - [0.38174] Time Remaining - [3:41:01]\n", + "06/27/2020 03:16:34 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [103000] Training Loss - [0.38101] Time Remaining - [3:35:23]\n", + "06/27/2020 03:22:06 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [103500] Training Loss - [0.38023] Time Remaining - [3:29:44]\n", + "06/27/2020 03:27:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [104000] Training Loss - [0.37941] Time Remaining - [3:24:10]\n", + "06/27/2020 03:33:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [104500] Training Loss - [0.37866] Time Remaining - [3:18:34]\n", + "06/27/2020 03:38:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [105000] Training Loss - [0.37794] Time Remaining - [3:12:57]\n", + "06/27/2020 03:44:19 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [105500] Training Loss - [0.37717] Time Remaining - [3:07:24]\n", + "06/27/2020 03:49:49 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [106000] Training Loss - [0.37640] Time Remaining - [3:01:44]\n", + "06/27/2020 03:55:22 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [106500] Training Loss - [0.37562] Time Remaining - [2:56:09]\n", + "06/27/2020 04:00:54 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [107000] Training Loss - [0.37492] Time Remaining - [2:50:34]\n", + "06/27/2020 04:06:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [107500] Training Loss - [0.37413] Time Remaining - [2:45:01]\n", + "06/27/2020 04:12:01 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [108000] Training Loss - [0.37329] Time Remaining - [2:39:28]\n", + "06/27/2020 04:17:35 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [108500] Training Loss - [0.37250] Time Remaining - [2:33:56]\n", + "06/27/2020 04:23:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [109000] Training Loss - [0.37170] Time Remaining - [2:28:21]\n", + "06/27/2020 04:28:42 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [109500] Training Loss - [0.37095] Time Remaining - [2:22:50]\n", + "06/27/2020 04:34:15 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [110000] Training Loss - [0.37015] Time Remaining - [2:17:16]\n", + "06/27/2020 04:39:48 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [110500] Training Loss - [0.36936] Time Remaining - [2:11:43]\n", + "06/27/2020 04:45:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [111000] Training Loss - [0.36862] Time Remaining - [2:06:06]\n", + "06/27/2020 04:50:52 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [111500] Training Loss - [0.36786] Time Remaining - [2:00:34]\n", + "06/27/2020 04:56:24 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [112000] Training Loss - [0.36715] Time Remaining - [1:54:59]\n", + "06/27/2020 05:01:59 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [112500] Training Loss - 
[0.36641] Time Remaining - [1:49:28]\n", + "06/27/2020 05:07:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [113000] Training Loss - [0.36561] Time Remaining - [1:43:54]\n", + "06/27/2020 05:13:08 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [113500] Training Loss - [0.36493] Time Remaining - [1:38:22]\n", + "06/27/2020 05:18:39 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [114000] Training Loss - [0.36422] Time Remaining - [1:32:48]\n", + "06/27/2020 05:24:13 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [114500] Training Loss - [0.36346] Time Remaining - [1:27:15]\n", + "06/27/2020 05:29:45 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [115000] Training Loss - [0.36276] Time Remaining - [1:21:41]\n", + "06/27/2020 05:35:18 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [115500] Training Loss - [0.36208] Time Remaining - [1:16:08]\n", + "06/27/2020 05:40:50 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [116000] Training Loss - [0.36137] Time Remaining - [1:10:34]\n", + "06/27/2020 05:46:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [116500] Training Loss - [0.36070] Time Remaining - [1:05:00]\n", + "06/27/2020 05:51:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [117000] Training Loss - [0.36006] Time Remaining - [0:59:27]\n", + "06/27/2020 05:57:28 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [117500] Training Loss - [0.35940] Time Remaining - [0:53:54]\n", + "06/27/2020 06:03:00 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [118000] Training Loss - [0.35885] Time Remaining - [0:48:21]\n", + "06/27/2020 06:08:32 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [118500] Training Loss - [0.35815] Time Remaining - [0:42:48]\n", + "06/27/2020 06:14:07 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [119000] Training Loss - [0.35751] Time Remaining - [0:37:15]\n", + "06/27/2020 06:19:40 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [119500] Training Loss - [0.35685] Time Remaining - [0:31:42]\n", + "06/27/2020 06:25:16 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [120000] Training Loss - [0.35616] Time Remaining - [0:26:09]\n", + "06/27/2020 06:30:47 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [120500] Training Loss - [0.35558] Time Remaining - [0:20:35]\n", + "06/27/2020 06:36:21 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [121000] Training Loss - [0.35499] Time Remaining - [0:15:02]\n", + "06/27/2020 06:41:53 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [121500] Training Loss - [0.35438] Time Remaining - [0:09:29]\n", + "06/27/2020 06:47:26 - mtdnn.modeling_mtdnn - INFO - Task - [ 0] Updates - [122000] Training Loss - [0.35374] Time Remaining - [0:03:56]\n", + "06/27/2020 06:51:23 - mtdnn.modeling_mtdnn - INFO - Saving mt-dnn model to /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\n", + "06/27/2020 06:51:24 - mtdnn.modeling_mtdnn - INFO - model saved to /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\n" + ] + } + ], "source": [ - "model.fit(epoch=1)\n", - "model.predict()" + "model.fit(epochs=NUM_EPOCHS)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Obtain predictions with a previously trained model checkpoint" + "### Evaluation and Prediction\n", + "Perform inference using the last (best) checkpointed model. 
With 5 epochs, the last model would be `model_4.pt`" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06/27/2020 06:51:24 - mtdnn.modeling_mtdnn - INFO - Running predictions using: /tmp/tmpd9ok4aeo/checkpoint/model_4.pt\n", + "06/27/2020 06:51:25 - mtdnn.modeling_mtdnn - INFO - predicting 0\n", + "06/27/2020 06:51:45 - mtdnn.modeling_mtdnn - INFO - predicting 100\n", + "06/27/2020 06:52:05 - mtdnn.modeling_mtdnn - INFO - predicting 200\n", + "06/27/2020 06:52:27 - mtdnn.modeling_mtdnn - INFO - predicting 300\n", + "06/27/2020 06:52:47 - mtdnn.modeling_mtdnn - INFO - predicting 400\n", + "06/27/2020 06:53:07 - mtdnn.modeling_mtdnn - INFO - predicting 500\n", + "06/27/2020 06:53:28 - mtdnn.modeling_mtdnn - INFO - predicting 600\n", + "06/27/2020 06:53:48 - mtdnn.modeling_mtdnn - INFO - predicting 700\n", + "06/27/2020 06:54:10 - mtdnn.modeling_mtdnn - INFO - predicting 800\n", + "06/27/2020 06:54:30 - mtdnn.modeling_mtdnn - INFO - predicting 900\n", + "06/27/2020 06:54:50 - mtdnn.modeling_mtdnn - INFO - predicting 1000\n", + "06/27/2020 06:55:11 - mtdnn.modeling_mtdnn - INFO - predicting 1100\n", + "06/27/2020 06:55:31 - mtdnn.modeling_mtdnn - INFO - predicting 1200\n", + "06/27/2020 06:55:37 - mtdnn.modeling_mtdnn - INFO - Task mnli_mismatched -- epoch 0 -- Dev ACC: 84.422\n", + "06/27/2020 06:55:37 - mtdnn.modeling_mtdnn - INFO - predicting 0\n", + "06/27/2020 06:55:59 - mtdnn.modeling_mtdnn - INFO - predicting 100\n", + "06/27/2020 06:56:19 - mtdnn.modeling_mtdnn - INFO - predicting 200\n", + "06/27/2020 06:56:39 - mtdnn.modeling_mtdnn - INFO - predicting 300\n", + "06/27/2020 06:57:00 - mtdnn.modeling_mtdnn - INFO - predicting 400\n", + "06/27/2020 06:57:21 - mtdnn.modeling_mtdnn - INFO - predicting 500\n", + "06/27/2020 06:57:42 - mtdnn.modeling_mtdnn - INFO - predicting 600\n", + "06/27/2020 06:58:02 - mtdnn.modeling_mtdnn - INFO - predicting 700\n", + "06/27/2020 06:58:22 - mtdnn.modeling_mtdnn - INFO - predicting 800\n", + "06/27/2020 06:58:42 - mtdnn.modeling_mtdnn - INFO - predicting 900\n", + "06/27/2020 06:59:04 - mtdnn.modeling_mtdnn - INFO - predicting 1000\n", + "06/27/2020 06:59:24 - mtdnn.modeling_mtdnn - INFO - predicting 1100\n", + "06/27/2020 06:59:45 - mtdnn.modeling_mtdnn - INFO - predicting 1200\n", + "06/27/2020 06:59:50 - mtdnn.modeling_mtdnn - INFO - [new test scores saved.]\n", + "06/27/2020 06:59:50 - mtdnn.modeling_mtdnn - INFO - predicting 0\n", + "06/27/2020 07:00:10 - mtdnn.modeling_mtdnn - INFO - predicting 100\n", + "06/27/2020 07:00:30 - mtdnn.modeling_mtdnn - INFO - predicting 200\n", + "06/27/2020 07:00:52 - mtdnn.modeling_mtdnn - INFO - predicting 300\n", + "06/27/2020 07:01:12 - mtdnn.modeling_mtdnn - INFO - predicting 400\n", + "06/27/2020 07:01:32 - mtdnn.modeling_mtdnn - INFO - predicting 500\n", + "06/27/2020 07:01:52 - mtdnn.modeling_mtdnn - INFO - predicting 600\n", + "06/27/2020 07:02:14 - mtdnn.modeling_mtdnn - INFO - predicting 700\n", + "06/27/2020 07:02:34 - mtdnn.modeling_mtdnn - INFO - predicting 800\n", + "06/27/2020 07:02:55 - mtdnn.modeling_mtdnn - INFO - predicting 900\n", + "06/27/2020 07:03:15 - mtdnn.modeling_mtdnn - INFO - predicting 1000\n", + "06/27/2020 07:03:35 - mtdnn.modeling_mtdnn - INFO - predicting 1100\n", + "06/27/2020 07:03:57 - mtdnn.modeling_mtdnn - INFO - predicting 1200\n", + "06/27/2020 07:04:03 - mtdnn.modeling_mtdnn - INFO - Task mnli_matched -- epoch 0 -- Dev ACC: 84.144\n", + "06/27/2020 07:04:03 - 
mtdnn.modeling_mtdnn - INFO - predicting 0\n", + "06/27/2020 07:04:23 - mtdnn.modeling_mtdnn - INFO - predicting 100\n", + "06/27/2020 07:04:43 - mtdnn.modeling_mtdnn - INFO - predicting 200\n", + "06/27/2020 07:05:04 - mtdnn.modeling_mtdnn - INFO - predicting 300\n", + "06/27/2020 07:05:24 - mtdnn.modeling_mtdnn - INFO - predicting 400\n", + "06/27/2020 07:05:45 - mtdnn.modeling_mtdnn - INFO - predicting 500\n", + "06/27/2020 07:06:06 - mtdnn.modeling_mtdnn - INFO - predicting 600\n", + "06/27/2020 07:06:26 - mtdnn.modeling_mtdnn - INFO - predicting 700\n", + "06/27/2020 07:06:46 - mtdnn.modeling_mtdnn - INFO - predicting 800\n", + "06/27/2020 07:07:08 - mtdnn.modeling_mtdnn - INFO - predicting 900\n", + "06/27/2020 07:07:28 - mtdnn.modeling_mtdnn - INFO - predicting 1000\n", + "06/27/2020 07:07:48 - mtdnn.modeling_mtdnn - INFO - predicting 1100\n", + "06/27/2020 07:08:09 - mtdnn.modeling_mtdnn - INFO - predicting 1200\n", + "06/27/2020 07:08:15 - mtdnn.modeling_mtdnn - INFO - [new test scores saved.]\n" + ] + } + ], + "source": [ + "model.predict(trained_model_chckpt=f\"{OUTPUT_DIR}/model_4.pt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The predict function can take an optional checkpoint, `trained_model_chckpt`. This can be used for inference and running evaluations on an already trained PyTorch MT-DNN model. \n", - "Optionally using a previously trained model as checkpoint. \n", - "\n", - "```Python\n", - "# Predict using a MT-DNN model checkpoint\n", - "checkpt = \"\"\n", - "model.predict(trained_model_chckpt=checkpt)\n", - "```" + "### Results" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Mnli Mismatched DevMnli Matched Dev
ACCURACY84.42284.144
\n", + "
" + ], + "text/plain": [ + " Mnli Mismatched Dev Mnli Matched Dev\n", + "ACCURACY 84.422 84.144" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = {}\n", + "dev_result_files = list(filter(lambda x: x.endswith('.json') and 'dev' in x, os.listdir(OUTPUT_DIR))) \n", + "for d in dev_result_files: \n", + " name = ' '.join(list(map(str.capitalize, d.split('_')))[:3]) \n", + " file_name = os.path.join(OUTPUT_DIR, d)\n", + " with open(file_name, 'r') as f: \n", + " res = json.load(f) \n", + " results.update(\n", + " {name: {\n", + " 'ACCURACY': f\"{res['metrics']['ACC']:.3f}\"\n", + " }\n", + " }) \n", + "df_results = pd.DataFrame(results) \n", + "df_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up temporary folders" ] }, { @@ -359,7 +1158,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "if os.path.exists(ROOT_DIR):\n", + " shutil.rmtree(ROOT_DIR, ignore_errors=True)" + ] } ], "metadata": { @@ -382,5 +1184,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/mtdnn/common/san.py b/mtdnn/common/san.py index 673a865..2ca87f0 100644 --- a/mtdnn/common/san.py +++ b/mtdnn/common/san.py @@ -288,3 +288,21 @@ def _generate_tasks_decoding_scoring_options(self): else: out_proj = nn.Linear(self.hidden_size, task_num_labels) self.scoring_list.append(out_proj) + + +class MaskLmHeader(nn.Module): + """Mask LM + """ + + def __init__(self, embedding_weights=None, bias=False): + super(MaskLmHeader, self).__init__() + self.decoder = nn.Linear( + embedding_weights.size(1), embedding_weights.size(0), bias=bias + ) + self.decoder.weight = embedding_weights + self.nsp = nn.Linear(embedding_weights.size(1), 2) + + def forward(self, hidden_states): + mlm_out = self.decoder(hidden_states) + nsp_out = self.nsp(hidden_states[:, 0, :]) + return mlm_out, nsp_out diff --git a/mtdnn/common/san_model.py b/mtdnn/common/san_model.py new file mode 100644 index 0000000..ade7bc9 --- /dev/null +++ b/mtdnn/common/san_model.py @@ -0,0 +1,141 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from pytorch_pretrained_bert.modeling import BertConfig, BertEmbeddings, BertLayerNorm +from torch.nn.parameter import Parameter + +from mtdnn.common.dropout_wrapper import DropoutWrapper +from mtdnn.common.similarity import SelfAttnWrapper + + +class SanLayer(nn.Module): + def __init__(self, num_hid, bidirect, dropout, rnn_type): + super().__init__() + + assert isinstance(rnn_type, str) + rnn_type = rnn_type.upper() + assert rnn_type == "LSTM" or rnn_type == "GRU" + rnn_cls = getattr(nn, rnn_type) + self._rnn = rnn_cls( + num_hid, + num_hid, + 1, + bidirectional=bidirect, + dropout=dropout, + batch_first=True, + ) + self._layer_norm = BertLayerNorm(num_hid, eps=1e-12) + self.rnn_type = rnn_type + self.num_hid = num_hid + self.ndirections = 1 + int(bidirect) + + def init_hidden(self, batch): + weight = next(self.parameters()).data + hid_shape = (self.ndirections, batch, self.num_hid) + if self.rnn_type == "LSTM": + return (weight.new(*hid_shape).zero_(), weight.new(*hid_shape).zero_()) + else: + return weight.new(*hid_shape).zero_() + + def forward(self, x, attention_mask): + # x: [batch, sequence, in_dim] + self._rnn.flatten_parameters() + + batch = x.size(0) + hidden0 = self.init_hidden(batch) + + tmp_output = self._rnn(x, hidden0)[0] + if self.ndirections > 1: + size = tmp_output.shape + tmp_output = 
tmp_output.view(size[0], size[1], self.num_hid, 2).max(-1)[0] + output = self._layer_norm(x + tmp_output) + return output + + +class SanEncoder(nn.Module): + def __init__(self, num_hid, nlayers, bidirect, dropout, rnn_type="LSTM"): + super().__init__() + layer = SanLayer(num_hid, bidirect, dropout, rnn_type) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(nlayers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class SanPooler(nn.Module): + def __init__(self, hidden_size, dropout_p): + super().__init__() + my_dropout = DropoutWrapper(dropout_p, False) + self.self_att = SelfAttnWrapper(hidden_size, dropout=my_dropout) + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states, attention_mask): + """ + + Arguments: + hidden_states {FloatTensor} -- shape (batch, seq_len, hidden_size) + attention_mask {ByteTensor} -- 1 indicates padded token + """ + first_token_tensor = self.self_att(hidden_states, attention_mask) + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SanModel(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.embeddings = BertEmbeddings(config) + self.encoder = SanEncoder( + config.hidden_size, + config.num_hidden_layers, + True, + config.hidden_dropout_prob, + ) + self.pooler = SanPooler(config.hidden_size, config.hidden_dropout_prob) + self.config = config + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + output_all_encoded_layers=True, + ): + """[summary] + + Arguments: + input_ids {LongTensor} -- shape [batch_size, seq_len] + + Keyword Arguments: + token_type_ids {LongTensor} -- shape [batch_size, seq_len] + attention_mask {LongTensor} -- 0 indicates padding tokens + + Returns: Tuple of (sequence_output, pooled_output) + """ + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder( + embedding_output, + attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + ) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output, attention_mask == 0) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output diff --git a/mtdnn/common/squad_prepro.py b/mtdnn/common/squad_prepro.py index 468369b..1f20202 100644 --- a/mtdnn/common/squad_prepro.py +++ b/mtdnn/common/squad_prepro.py @@ -1,104 +1,131 @@ -import os import argparse -from sys import path import json -path.append(os.getcwd()) -from data_utils.log_wrapper import create_logger -from experiments.common_utils import dump_rows -from data_utils import DataFormat +import os +from sys import path + +from mtdnn.common.types import DataFormat +from mtdnn.common.utils import MTDNNCommonUtils +from mtdnn.tasks.utils import process_data_and_dump_rows +from mtdnn.common.vocab import END + +# path.append(os.getcwd()) + + +logger = MTDNNCommonUtils.setup_logging(filename="squad_prepro.log") -logger = 
create_logger(__name__, to_disk=True, log_file='squad_prepro.log') def normalize_qa_field(s: str, replacement_list): for replacement in replacement_list: - s = s.replace(replacement, " " * len(replacement)) # ensure answer_start and answer_end still valid + s = s.replace( + replacement, " " * len(replacement) + ) # ensure answer_start and answer_end still valid return s -#END = 'EOSEOS' + +# END = 'EOSEOS' def load_data(path, is_train=True, v2_on=False): rows = [] with open(path, encoding="utf8") as f: - data = json.load(f)['data'] + data = json.load(f)["data"] for article in data: - for paragraph in article['paragraphs']: - context = paragraph['context'] + for paragraph in article["paragraphs"]: + context = paragraph["context"] if v2_on: - context = '{} {}'.format(context, END) - for qa in paragraph['qas']: - uid, question = qa['id'], qa['question'] - answers = qa.get('answers', []) + context = "{} {}".format(context, END) + for qa in paragraph["qas"]: + uid, question = qa["id"], qa["question"] + answers = qa.get("answers", []) # used for v2.0 - is_impossible = qa.get('is_impossible', False) + is_impossible = qa.get("is_impossible", False) label = 1 if is_impossible else 0 - if (v2_on and label < 1 and len(answers) < 1) or ((not v2_on) and len(answers) < 1): + if (v2_on and label < 1 and len(answers) < 1) or ( + (not v2_on) and len(answers) < 1 + ): # detect inconsistent data # * for v2, the row is possible but has no answer # * for v1, all questions should have answer continue if len(answers) > 0: - answer = answers[0]['text'] - answer_start = answers[0]['answer_start'] + answer = answers[0]["text"] + answer_start = answers[0]["answer_start"] answer_end = answer_start + len(answer) else: # for questions without answers, give a fake answer - #answer = END - #answer_start = len(context) - len(END) - #answer_end = len(context) - answer = '' + # answer = END + # answer_start = len(context) - len(END) + # answer_end = len(context) + answer = "" answer_start = -1 answer_end = -1 answer = normalize_qa_field(answer, ["\n", "\t", ":::"]) context = normalize_qa_field(context, ["\n", "\t"]) question = normalize_qa_field(question, ["\n", "\t"]) - sample = {'uid': uid, 'premise': context, 'hypothesis': question, - 'label': "%s:::%s:::%s:::%s" % (answer_start, answer_end, label, answer)} + sample = { + "uid": uid, + "premise": context, + "hypothesis": question, + "label": "%s:::%s:::%s:::%s" + % (answer_start, answer_end, label, answer), + } rows.append(sample) return rows + def parse_args(): - parser = argparse.ArgumentParser(description='Preprocessing SQUAD data.') - parser.add_argument('--root_dir', type=str, default='data') + parser = argparse.ArgumentParser(description="Preprocessing SQUAD data.") + parser.add_argument("--root_dir", type=str, default="data") args = parser.parse_args() return args + def main(args): root = args.root_dir assert os.path.exists(root) - squad_train_path = os.path.join(root, 'squad/train.json') - squad_dev_path = os.path.join(root, 'squad/dev.json') - squad_v2_train_path = os.path.join(root, 'squad_v2/train.json') - squad_v2_dev_path = os.path.join(root, 'squad_v2/dev.json') + squad_train_path = os.path.join(root, "squad/train.json") + squad_dev_path = os.path.join(root, "squad/dev.json") + squad_v2_train_path = os.path.join(root, "squad_v2/train.json") + squad_v2_dev_path = os.path.join(root, "squad_v2/dev.json") squad_train_data = load_data(squad_train_path) squad_dev_data = load_data(squad_dev_path, is_train=False) - logger.info('Loaded {} squad train 
samples'.format(len(squad_train_data))) - logger.info('Loaded {} squad dev samples'.format(len(squad_dev_data))) + logger.info("Loaded {} squad train samples".format(len(squad_train_data))) + logger.info("Loaded {} squad dev samples".format(len(squad_dev_data))) squad_v2_train_data = load_data(squad_v2_train_path, v2_on=True) squad_v2_dev_data = load_data(squad_v2_dev_path, is_train=False, v2_on=True) - logger.info('Loaded {} squad_v2 train samples'.format(len(squad_v2_train_data))) - logger.info('Loaded {} squad_v2 dev samples'.format(len(squad_v2_dev_data))) + logger.info("Loaded {} squad_v2 train samples".format(len(squad_v2_train_data))) + logger.info("Loaded {} squad_v2 dev samples".format(len(squad_v2_dev_data))) canonical_data_suffix = "canonical_data" canonical_data_root = os.path.join(root, canonical_data_suffix) if not os.path.isdir(canonical_data_root): os.mkdir(canonical_data_root) - squad_train_fout = os.path.join(canonical_data_root, 'squad_train.tsv') - squad_dev_fout = os.path.join(canonical_data_root, 'squad_dev.tsv') - dump_rows(squad_train_data, squad_train_fout, DataFormat.PremiseAndOneHypothesis) - dump_rows(squad_dev_data, squad_dev_fout, DataFormat.PremiseAndOneHypothesis) - logger.info('done with squad') - - squad_v2_train_fout = os.path.join(canonical_data_root, 'squad-v2_train.tsv') - squad_v2_dev_fout = os.path.join(canonical_data_root, 'squad-v2_dev.tsv') - dump_rows(squad_v2_train_data, squad_v2_train_fout, DataFormat.PremiseAndOneHypothesis) - dump_rows(squad_v2_dev_data, squad_v2_dev_fout, DataFormat.PremiseAndOneHypothesis) - logger.info('done with squad_v2') + squad_train_fout = os.path.join(canonical_data_root, "squad_train.tsv") + squad_dev_fout = os.path.join(canonical_data_root, "squad_dev.tsv") + process_data_and_dump_rows( + squad_train_data, squad_train_fout, DataFormat.PremiseAndOneHypothesis, True + ) + process_data_and_dump_rows( + squad_dev_data, squad_dev_fout, DataFormat.PremiseAndOneHypothesis, True + ) + logger.info("done with squad") + squad_v2_train_fout = os.path.join(canonical_data_root, "squad-v2_train.tsv") + squad_v2_dev_fout = os.path.join(canonical_data_root, "squad-v2_dev.tsv") + process_data_and_dump_rows( + squad_v2_train_data, + squad_v2_train_fout, + DataFormat.PremiseAndOneHypothesis, + True, + ) + process_data_and_dump_rows( + squad_v2_dev_data, squad_v2_dev_fout, DataFormat.PremiseAndOneHypothesis, True + ) + logger.info("done with squad_v2") -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) diff --git a/mtdnn/common/tokenization_utils.py b/mtdnn/common/tokenization_utils.py new file mode 100644 index 0000000..86418bb --- /dev/null +++ b/mtdnn/common/tokenization_utils.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright (c) Microsoft. All rights reserved. 
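+#
+# Helpers for reading task data in MT-DNN row format: `load_task_data` parses rows
+# (from a TSV path or an in-memory list) into dicts according to the task's
+# DataFormat and TaskType, and `load_score_file` loads a JSON score dump keyed by
+# sample uid.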
+import json +from typing import Union + +import numpy as np + +from mtdnn.common.types import DataFormat, TaskType, TaskDefType +from mtdnn.tasks.config import get_task_obj + + +def load_task_data( + file_path_or_processed_data_list: Union[str, list], task_def: TaskDefType +): + """Load data in MT-DNN Format + + Arguments: + file_path_or_processed_data_list {Union[str, list]} -- File path or processed rows object + task_def {dict} -- Task Definition to be loaded + + Raises: + ValueError: Invalid Task requested + + Returns: + list -- list of processed data in MT-DNN Format + """ + assert task_def, "[ERROR] - Task Definition cannot be none" + data_format = task_def.data_format + task_type = task_def.task_type + label_dict = task_def.label_vocab + if task_type == TaskType.Ranking: + assert data_format == DataFormat.PremiseAndMultiHypothesis + if isinstance(file_path_or_processed_data_list, str): + processed_data = open(file_path_or_processed_data_list, encoding="utf-8") + elif isinstance(file_path_or_processed_data_list, list): + processed_data = file_path_or_processed_data_list + + rows = [] + for line in processed_data: + fields = line.strip("\n").split("\t") + if data_format == DataFormat.PremiseOnly: + assert len(fields) == 3 + row = {"uid": fields[0], "label": fields[1], "premise": fields[2]} + elif data_format == DataFormat.PremiseAndOneHypothesis: + assert len(fields) == 4 + row = { + "uid": fields[0], + "label": fields[1], + "premise": fields[2], + "hypothesis": fields[3], + } + elif data_format == DataFormat.PremiseAndMultiHypothesis: + assert len(fields) > 5 + row = { + "uid": fields[0], + "ruid": fields[1].split(","), + "label": fields[2], + "premise": fields[3], + "hypothesis": fields[4:], + } + elif data_format == DataFormat.Sequence: + row = { + "uid": fields[0], + "label": eval(fields[1]), + "premise": eval(fields[2]), + } + + elif data_format == DataFormat.MRC: + row = { + "uid": fields[0], + "label": fields[1], + "premise": fields[2], + "hypothesis": fields[3], + } + else: + raise ValueError(data_format) + + task_obj = get_task_obj(task_def) + if task_obj is not None: + row["label"] = task_obj.input_parse_label(row["label"]) + elif task_type == TaskType.Ranking: + labels = row["label"].split(",") + if label_dict is not None: + labels = [label_dict[label] for label in labels] + else: + labels = [float(label) for label in labels] + row["label"] = int(np.argmax(labels)) + row["olabel"] = labels + elif task_type == TaskType.Span: + pass # don't process row label + elif task_type == TaskType.SequenceLabeling: + assert type(row["label"]) is list + row["label"] = [label_dict[label] for label in row["label"]] + + rows.append(row) + return rows + + +def load_score_file(score_path: str = "", n_class: int = 1): + sample_id_2_pred_score_seg_dic = {} + score_obj = json.loads(open(score_path, encoding="utf-8").read()) + assert (len(score_obj["scores"]) % len(score_obj["uids"]) == 0) and ( + len(score_obj["scores"]) / len(score_obj["uids"]) == n_class + ), "[ERROR] - scores column size should equal to sample count or multiple of sample count (for classification problem)" + + scores = score_obj["scores"] + score_segs = [ + scores[i * n_class : (i + 1) * n_class] for i in range(len(score_obj["uids"])) + ] + for sample_id, pred, score_seg in zip( + score_obj["uids"], score_obj["predictions"], score_segs + ): + sample_id_2_pred_score_seg_dic[sample_id] = (pred, score_seg) + return sample_id_2_pred_score_seg_dic diff --git a/mtdnn/common/types.py b/mtdnn/common/types.py index 9e802a2..45fef5c 
100644 --- a/mtdnn/common/types.py +++ b/mtdnn/common/types.py @@ -9,15 +9,61 @@ class TaskType(IntEnum): Ranking = 3 Span = 4 SequenceLabeling = 5 + MaskLM = 6 class DataFormat(IntEnum): + Init = 0 PremiseOnly = 1 PremiseAndOneHypothesis = 2 PremiseAndMultiHypothesis = 3 - Sequence = 4 + MRC = 4 + Sequence = 5 + MLM = 6 class EncoderModelType(IntEnum): BERT = 1 ROBERTA = 2 + XLNET = 3 + SAN = 4 + + +class TaskDefType(dict): + def __init__( + self, + label_vocab, + n_class, + data_format, + task_type, + metric_meta, + split_names, + enable_san, + dropout_p, + loss, + kd_loss, + data_paths, + ): + """ + :param label_vocab: map string label to numbers. + only valid for Classification task or ranking task. + For ranking task, better label should have large number + """ + super().__init__( + **{k: repr(v) for k, v in locals().items()} + ) # ensure the class is JSON serializable + self.label_vocab = label_vocab + self.n_class = n_class + self.data_format = data_format + self.task_type = task_type + self.metric_meta = metric_meta + self.split_names = split_names + self.enable_san = enable_san + self.dropout_p = dropout_p + self.loss = loss + self.kd_loss = kd_loss + self.data_paths = data_paths + + @classmethod + def from_dict(cls, dict_rep): + return cls(**dict_rep) diff --git a/mtdnn/common/utils.py b/mtdnn/common/utils.py index 02d765b..624f907 100644 --- a/mtdnn/common/utils.py +++ b/mtdnn/common/utils.py @@ -5,16 +5,20 @@ import logging import math import os +import random import subprocess +import sys import tarfile import zipfile from contextlib import contextmanager from logging import Logger from tempfile import TemporaryDirectory +import numpy import requests import torch from tqdm import tqdm +from time import gmtime, strftime class MTDNNCommonUtils: @@ -67,23 +71,32 @@ def generate_decoder_opt(enable_san, max_opt): return opt_v @staticmethod - def setup_logging(filename="run.log", mode="w") -> Logger: - logger = logging.getLogger(__name__) - log_file_handler = logging.FileHandler( - filename=filename, mode=mode, encoding="utf-8" + def create_logger(name, silent=False, to_disk=False, log_file="run.log"): + """ Logger wrapper """ + # setup logger + log = logging.getLogger(name) + log.setLevel(logging.DEBUG) + log.propagate = False + formatter = logging.Formatter( + fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S", ) - log_formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - log_file_handler.setFormatter(log_formatter) - do_add_handler = True - for handler in logger.handlers: - if isinstance(handler, logging.FileHandler): - do_add_handler = False - if do_add_handler: - logger.addHandler(log_file_handler) - logger.setLevel(logging.DEBUG) - return logger + if not silent: + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + log.addHandler(ch) + if to_disk: + log_file = ( + log_file + if log_file is not None + else strftime("%Y-%m-%d-%H-%M-%S.log", gmtime()) + ) + fh = logging.FileHandler(log_file) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + log.addHandler(fh) + return log @staticmethod def create_directory_if_not_exists(dir_path: str): @@ -104,7 +117,9 @@ def download_path(path=None): tmp_dir.cleanup() @staticmethod - def maybe_download(url, filename=None, work_directory=".", expected_bytes=None): + def maybe_download( + url, filename=None, work_directory=".", expected_bytes=None, log: Logger = None + ): """Download a file if it is not already 
downloaded. Args: diff --git a/mtdnn/common/vocab.py b/mtdnn/common/vocab.py index 2d74cd9..1e4c3a0 100644 --- a/mtdnn/common/vocab.py +++ b/mtdnn/common/vocab.py @@ -2,23 +2,25 @@ import tqdm import unicodedata -PAD = 'PADPAD' -UNK = 'UNKUNK' -STA= 'BOSBOS' -END = 'EOSEOS' +PAD = "PADPAD" +UNK = "UNKUNK" +STA = "BOSBOS" +END = "EOSEOS" PAD_ID = 0 UNK_ID = 1 STA_ID = 2 END_ID = 3 + class Vocabulary(object): INIT_LEN = 4 + def __init__(self, neat=False): self.neat = neat if not neat: self.tok2ind = {PAD: PAD_ID, UNK: UNK_ID, STA: STA_ID, END: END_ID} - self.ind2tok = {PAD_ID: PAD, UNK_ID: UNK, STA_ID: STA, END_ID:END} + self.ind2tok = {PAD_ID: PAD, UNK_ID: UNK, STA_ID: STA, END_ID: END} else: self.tok2ind = {} self.ind2tok = {} @@ -37,9 +39,15 @@ def __contains__(self, key): def __getitem__(self, key): if type(key) == int: - return self.ind2tok.get(key, -1) if self.neat else self.ind2tok.get(key, UNK) + return ( + self.ind2tok.get(key, -1) if self.neat else self.ind2tok.get(key, UNK) + ) if type(key) == str: - return self.tok2ind.get(key, None) if self.neat else self.tok2ind.get(key,self.tok2ind.get(UNK)) + return ( + self.tok2ind.get(key, None) + if self.neat + else self.tok2ind.get(key, self.tok2ind.get(UNK)) + ) def __setitem__(self, key, item): if type(key) == int and type(item) == str: @@ -47,7 +55,7 @@ def __setitem__(self, key, item): elif type(key) == str and type(item) == int: self.tok2ind[key] = item else: - raise RuntimeError('Invalid (key, item) types.') + raise RuntimeError("Invalid (key, item) types.") def add(self, token): if token not in self.tok2ind: @@ -59,8 +67,7 @@ def get_vocab_list(self, with_order=True): if with_order: words = [self[k] for k in range(0, len(self))] else: - words = [k for k in self.tok2ind.keys() - if k not in {PAD, UNK, STA, END}] + words = [k for k in self.tok2ind.keys() if k not in {PAD, UNK, STA, END}] return words def toidx(self, tokens): @@ -74,7 +81,8 @@ def copy(self): new_vocab.add(w) return new_vocab - def build(words, neat=False): + def build(self, words, neat=False): vocab = Vocabulary(neat) - for w in words: vocab.add(w) + for w in words: + vocab.add(w) return vocab diff --git a/mtdnn/data_builder_mtdnn.py b/mtdnn/data_builder_mtdnn.py new file mode 100644 index 0000000..11c3b09 --- /dev/null +++ b/mtdnn/data_builder_mtdnn.py @@ -0,0 +1,444 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
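+#
+# Loads raw task files (GLUE and CoNLL-style NER) with per-task loaders, dumps the
+# processed rows in canonical MT-DNN format, and tokenizes them into model-ready
+# features; `MTDNNDataBuilder.vectorize()` returns rows keyed by task split name.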
+import json +import os +from collections import ChainMap, defaultdict +from typing import Dict, List + +from tqdm import tqdm + +from mtdnn.common import squad_utils +from mtdnn.common.tokenization_utils import load_task_data +from mtdnn.common.types import DataFormat, EncoderModelType +from mtdnn.common.utils import MTDNNCommonUtils +from mtdnn.tasks.config import MTDNNTaskDefs +from mtdnn.tasks.utils import ( + load_cola, + load_conll_chunk, + load_conll_ner, + load_conll_pos, + load_mnli, + load_mrpc, + load_qnli, + load_qqp, + load_rte, + load_scitail, + load_snli, + load_sst, + load_stsb, + load_wnli, + process_data_and_dump_rows, +) +from mtdnn.tokenizer_mtdnn import MTDNNTokenizer + + +logger = MTDNNCommonUtils.create_logger(__name__, to_disk=True) + +# Map of supported tasks +GLUE_SUPPORTED_TASKS_LOADER_MAP = { + "cola": load_cola, + "mnli": load_mnli, + "mrpc": load_mrpc, + "qnli": load_qnli, + "qqp": load_qqp, + "rte": load_rte, + "scitail": load_scitail, + "snli": load_snli, + "sst": load_sst, + "stsb": load_stsb, + "wnli": load_wnli, +} +NER_SUPPORTED_TASKS_LOADER_MAP = { + "ner": load_conll_ner, + "pos": load_conll_pos, + "chunk": load_conll_chunk, +} +SUPPORTED_TASKS_LOADER_MAP = ChainMap( + GLUE_SUPPORTED_TASKS_LOADER_MAP, NER_SUPPORTED_TASKS_LOADER_MAP +) + + +class MTDNNTaskDataFileLoader: + supported_tasks_loader_map = SUPPORTED_TASKS_LOADER_MAP + + def __init__( + self, + task_defs: MTDNNTaskDefs, + data_dir: str = "data", + canonical_data_suffix: str = "canonical_data", + ): + self.data_dir = data_dir + self.task_defs = task_defs + self.canonical_data_dir = os.path.join(self.data_dir, canonical_data_suffix) + if not os.path.isdir(self.canonical_data_dir): + os.makedirs(self.canonical_data_dir) + + def load_and_build_data(self, dump_rows: bool = False) -> dict: + """ + Load and build out the GLUE and NER Tasks. 
+ + Data Options format in Task Definitions will look like: + data_opts_map[name] = { + "data_paths": ['train', 'test', 'dev1', 'dev2']", + "data_opts": { + "header": task.header or True, + "is_train": task.is_train or True, + "multi_snli": task.multi_snli or False, + }, + } + + Keyword Arguments: + dump_rows {bool} -- Dump processed rows to disk after processing (default: {False}) + + Raises: + IOError: IO Error from reading the files from disk + + Returns: + dict -- Dictionary of processed data rows with key as task name + """ + + datasets_map: dict = self.task_defs.data_paths_map + processed_data = defaultdict(lambda: []) + # For each task, load file and build out data + + for name, params in datasets_map.items(): + + # TODO - standardize parameters for all loaders to use opts + data_opts: dict = params.get("data_opts", None) + assert data_opts, "[ERROR] - Data opts cannot be None" + + # For each task, we process the provided data files into MT-DNN format + # Format of input is of the form MNLI/{train.tsv, dev_matched.tsv, dev_mismatched.tsv, ...} + for path in params["data_paths"]: + in_file_path = os.path.join(self.data_dir, path) + in_file = os.path.split(path)[-1] + out_file_name = f"{name}_{in_file}" + out_file_path = os.path.join(self.canonical_data_dir, out_file_name) + + #################################################################### + # HANDLE SPECIAL CASES # + #################################################################### + + # Set data processing option for test data + if "test" in in_file: + data_opts["is_train"] = False + + # No header for cola train and dev data + if name == "cola" and ("train" in in_file or "dev" in in_file): + data_opts["header"] = False + try: + # Load and dump file + task_load_func = self.supported_tasks_loader_map[name] + data = task_load_func(in_file_path, data_opts) + processed_rows = process_data_and_dump_rows( + rows=data, + out_path=out_file_path, + data_format=self.task_defs.data_type_map[name], + dump_rows=dump_rows, + ) + # Format - cola_dev: [processed_rows] + processed_data.update( + {os.path.splitext(out_file_name)[0]: processed_rows} + ) + logger.info( + f"Sucessfully loaded and built {len(data)} samples for {name} at {out_file_path}" + ) + except Exception as ex: + raise IOError(ex) + return processed_data + + +class MTDNNDataBuilder: + + DEBUG_MODE = False + MAX_SEQ_LEN = 512 + DOC_STRIDE = 180 + MAX_QUERY_LEN = 64 + MRC_MAX_SEQ_LEN = 384 + + def __init__( + self, + tokenizer: MTDNNTokenizer = None, + task_defs: MTDNNTaskDefs = None, + do_lower_case: bool = False, + data_dir: str = "data", + canonical_data_suffix: str = "canonical_data", + dump_rows: bool = False, + ): + assert tokenizer, "[ERROR] - MTDNN Tokenizer is required" + assert task_defs, "[ERROR] - MTDNN Task Definition is required" + self.tokenizer = tokenizer + self.task_defs = task_defs + self.save_to_file = dump_rows + self.model_name = ( + self.tokenizer.get_model_name() + ) # ensure model name is same as tokenizer + self.literal_model_name = self.model_name.split("-")[0] + self.model_type = EncoderModelType[ + self.literal_model_name.upper() + ] # BERT = 1, ROBERTA = 2ll + mt_dnn_model_name_fmt = self.model_name.replace( + "-", "_" + ) # format to mt-dnn format + self.mt_dnn_suffix = ( + f"{mt_dnn_model_name_fmt}_lower" + if do_lower_case + else f"{mt_dnn_model_name_fmt}" + ) + self.canonical_data_dir: str = f"{data_dir}/{canonical_data_suffix}" + self.mt_dnn_root = os.path.join(self.canonical_data_dir, self.mt_dnn_suffix) + if not 
os.path.isdir(self.mt_dnn_root): + os.makedirs(self.mt_dnn_root) + + # Load and process data + self.task_data_loader = MTDNNTaskDataFileLoader( + self.task_defs, data_dir, canonical_data_suffix, + ) + self.processed_tasks_data = self.task_data_loader.load_and_build_data( + self.save_to_file + ) + + def build_data_premise_only( + self, data: List, max_seq_len: int = 0, + ): + """ Build data of single sentence tasks """ + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = [] + for idx, sample in tqdm(enumerate(data), desc="Building Data For Premise Only"): + ids = sample["uid"] + premise = sample["premise"] + label = sample["label"] + if len(premise) > self.MAX_SEQ_LEN - 2: + premise = premise[: self.MAX_SEQ_LEN - 2] + input_ids, input_mask, type_ids = self.tokenizer.encode( + text=premise, max_length=self.MAX_SEQ_LEN, + ) + features = { + "uid": ids, + "label": label, + "token_id": input_ids, + "type_id": type_ids, + } + rows.append(features) + return rows + + def build_data_premise_and_one_hypo( + self, data: List, max_seq_len: int = 0, + ): + """ Build data of sentence pair tasks """ + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = [] + for idx, sample in tqdm( + enumerate(data), desc="Building Data For Premise and One Hypothesis" + ): + ids = sample["uid"] + premise = sample["premise"] + hypothesis = sample["hypothesis"] + label = sample["label"] + input_ids, input_mask, type_ids = self.tokenizer.encode( + text=premise, text_pair=hypothesis, max_length=max_seq_len, + ) + features = { + "uid": ids, + "label": label, + "token_id": input_ids, + "type_id": type_ids, + } + rows.append(features) + return rows + + def build_data_premise_and_multi_hypo( + self, data: List, max_seq_len: int = 0, + ): + """ Build QNLI as a pair-wise ranking task """ + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = [] + for idx, sample in tqdm( + enumerate(data), desc="Building Data For Premise and Multi Hypothesis" + ): + ids = sample["uid"] + premise = sample["premise"] + hypothesis_list = sample["hypothesis"] + label = sample["label"] + input_ids_list = [] + type_ids_list = [] + for hypothesis in hypothesis_list: + input_ids, mask, type_ids = self.tokenizer.encode( + text=premise, text_pair=hypothesis, max_length=max_seq_len, + ) + input_ids_list.append(input_ids) + type_ids_list.append(type_ids) + features = { + "uid": ids, + "label": label, + "token_id": input_ids_list, + "type_id": type_ids_list, + "ruid": sample["ruid"], + "olabel": sample["olabel"], + } + rows.append(features) + return rows + + def build_data_sequence( + self, data: List, max_seq_len: int = 0, label_mapper: Dict = None, + ): + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = [] + for idx, sample in tqdm(enumerate(data), desc="Building Data For Sequence"): + ids = sample["uid"] + premise = sample["premise"] + tokens = [] + labels = [] + for i, word in tqdm(enumerate(premise), desc="Building Sequence Premise"): + subwords = tokenizer.tokenize(word) + tokens.extend(subwords) + for j in range(len(subwords)): + if j == 0: + labels.append(sample["label"][i]) + else: + labels.append(label_mapper["X"]) + if len(premise) > max_seq_len - 2: + tokens = tokens[: max_seq_len - 2] + labels = labels[: max_seq_len - 2] + + label = [label_mapper["CLS"]] + labels + [label_mapper["SEP"]] + input_ids = tokenizer.convert_tokens_to_ids( + [tokenizer.cls_token] + tokens + [tokenizer.sep_token] + ) + assert len(label) == len(input_ids) + type_ids = [0] * len(input_ids) + 
features = { + "uid": ids, + "label": label, + "token_id": input_ids, + "type_id": type_ids, + } + rows.append(features) + return rows + + def build_data_mrc( + self, + data: List, + max_seq_len: int = 0, + label_mapper: Dict = None, + is_training: bool = True, + ): + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = [] + unique_id = 1000000000 # TODO: this is from BERT, needed to remove it... + for example_index, sample in tqdm( + enumerate(data), desc="Building Data For MRC" + ): + ids = sample["uid"] + doc = sample["premise"] + query = sample["hypothesis"] + label = sample["label"] + doc_tokens, cw_map = squad_utils.token_doc(doc) + ( + answer_start, + answer_end, + answer, + is_impossible, + ) = squad_utils.parse_squad_label(label) + answer_start_adjusted, answer_end_adjusted = squad_utils.recompute_span( + answer, answer_start, cw_map + ) + is_valid = squad_utils.is_valid_answer( + doc_tokens, answer_start_adjusted, answer_end_adjusted, answer + ) + if not is_valid: + continue + """ + TODO --xiaodl: support RoBERTa + """ + feature_list = squad_utils.mrc_feature( + self.tokenizer, + unique_id, + example_index, + query, + doc_tokens, + answer_start_adjusted, + answer_end_adjusted, + is_impossible, + max_seq_len, + self.MAX_QUERY_LEN, + self.DOC_STRIDE, + answer_text=answer, + is_training=True, + ) + unique_id += len(feature_list) + for feature in feature_list: + feature_obj = { + "uid": ids, + "token_id": feature.input_ids, + "mask": feature.input_mask, + "type_id": feature.segment_ids, + "example_index": feature.example_index, + "doc_span_index": feature.doc_span_index, + "tokens": feature.tokens, + "token_to_orig_map": feature.token_to_orig_map, + "token_is_max_context": feature.token_is_max_context, + "start_position": feature.start_position, + "end_position": feature.end_position, + "label": feature.is_impossible, + "doc": doc, + "doc_offset": feature.doc_offset, + "answer": [answer], + } + rows.append(feature_obj) + return rows + + def _build_data_from_format( + self, + data: List, + dump_path: str = "", + data_format: int = DataFormat.Init, + max_seq_len: int = 0, + label_mapper: Dict = None, + dump_rows: bool = False, + ): + max_seq_len = max_seq_len if max_seq_len else self.MAX_SEQ_LEN + rows = None + + # Process the data depending on the data format set from the config + if data_format == DataFormat.PremiseOnly: + rows = self.build_data_premise_only(data, max_seq_len) + elif data_format == DataFormat.PremiseAndOneHypothesis: + rows = self.build_data_premise_and_one_hypo(data, max_seq_len) + elif data_format == DataFormat.PremiseAndMultiHypothesis: + rows = self.build_data_premise_and_multi_hypo(data, max_seq_len) + elif data_format == DataFormat.Sequence: + rows = self.build_data_sequence(data, max_seq_len, label_mapper) + elif data_format == DataFormat.MRC: + rows = self.build_data_mrc(data, max_seq_len) + else: + raise ValueError(data_format) + + # Save file to disk + if self.save_to_file: + with open(dump_path, "w", encoding="utf-8") as writer: + logger.info(f"Saving data to {dump_path}") + for row in tqdm(rows, desc=f"Saving Data For {data_format.name}"): + writer.write(f"{json.dumps(row)}\n") + return rows + + def vectorize(self): + """ Tokenize and build data for the tasks """ + mtdnn_featurized_data = {} + for task_split_name, task_data in self.processed_tasks_data.items(): + print(task_split_name) + split_name = task_split_name.split("_") + task = split_name[0] + task_def = self.task_defs.get_task_def(task) + dump_path = 
f"{os.path.join(self.mt_dnn_root, task_split_name)}.json" + logger.info(f"Building Data For '{' '.join(split_name).upper()}' Task") + loaded_data = load_task_data(task_data, task_def) + rows = self._build_data_from_format( + data=loaded_data, + dump_path=dump_path, + data_format=task_def.data_format, + label_mapper=task_def.label_vocab, + dump_rows=self.save_to_file, + ) + mtdnn_featurized_data.update({task_split_name: rows}) + return mtdnn_featurized_data diff --git a/mtdnn/dataset_mtdnn.py b/mtdnn/dataset_mtdnn.py index 1065a53..77e5b91 100644 --- a/mtdnn/dataset_mtdnn.py +++ b/mtdnn/dataset_mtdnn.py @@ -5,15 +5,22 @@ import sys from shutil import copyfile +import numpy as np import torch +from pytorch_pretrained_bert.tokenization import BertTokenizer from torch.utils.data import BatchSampler, DataLoader, Dataset from mtdnn.common.types import DataFormat, EncoderModelType, TaskType +from mtdnn.common.utils import MTDNNCommonUtils +from mtdnn.tasks.mlm_utils import create_instances_from_document, load_loose_json UNK_ID = 100 BOS_ID = 101 +logger = MTDNNCommonUtils.create_logger(__name__, to_disk=True) + + class MTDNNMultiTaskBatchSampler(BatchSampler): def __init__(self, datasets, batch_size, mix_opt, extra_task_ratio): self._datasets = datasets @@ -103,67 +110,176 @@ def __getitem__(self, idx): class MTDNNSingleTaskDataset(Dataset): def __init__( self, - path, - is_train=True, - maxlen=128, - factor=1.0, - task_id=0, - task_type=TaskType.Classification, - data_type=DataFormat.PremiseOnly, + vectorized_data: list, + is_train: bool = True, + maxlen: int = 512, + factor: float = 1.0, + task_id: int = 0, + task_type: int = TaskType.Classification, + data_type: int = DataFormat.PremiseOnly, + bert_model: str = "bert-base-uncased", + do_lower_case: bool = True, + masked_lm_prob: float = 0.15, + seed: int = 13, + short_seq_prob: float = 0.1, + max_seq_length: int = 512, + max_predictions_per_seq: int = 80, ): - self._data = self.load(path, is_train, maxlen, factor, task_type) + self._data, self._tokenizer = self.load_vectorized_data( + vectorized_data, + is_train, + maxlen, + factor, + task_type, + bert_model, + do_lower_case, + ) self._task_id = task_id self._task_type = task_type self._data_type = data_type + # below is for MLM + if self._task_type is TaskType.MaskLM: + assert self._tokenizer, "[ERROR] - Tokenizer cannot be None" + + # initialize vocab words + self._vocab_words = ( + list(self._tokenizer.vocab.keys()) if self._tokenizer else None + ) + + self._masked_lm_prob = masked_lm_prob + self._seed = seed + self._short_seq_prob = short_seq_prob + self._max_seq_length = max_seq_length + self._max_predictions_per_seq = max_predictions_per_seq + self._rng = random.Random(seed) + def get_task_id(self): return self._task_id @staticmethod - def load(path, is_train=True, maxlen=128, factor=1.0, task_type=None): + def load_vectorized_data( + vectorized_data, + is_train=True, + maxlen=128, + factor=1.0, + task_type=None, + bert_model="bert-base-uncased", + do_lower_case=True, + masked_lm_prob=0.15, + seed=13, + short_seq_prob=0.1, + max_seq_length=512, + max_predictions_per_seq=80, + ): + """Load Vectorized Data + + Arguments: + vectorized_data {[type]} -- [description] + + Keyword Arguments: + is_train {bool} -- If Data is training (default: {True}) + maxlen {int} -- Maximum length (default: {128}) + factor {float} -- Sequence Factor (default: {1.0}) + task_type {[type]} -- ask type to train (default: {None}) + bert_model {str} -- Model checkpoint (default: {"bert-base-uncased"}) + 
do_lower_case {bool} -- Use model checkpoint lower case version (default: {True}) + masked_lm_prob {float} -- [description] (default: {0.15}) + seed {int} -- seed value (default: {13}) + short_seq_prob {float} -- Short sequence probability (default: {0.1}) + max_seq_length {int} -- Maximum sequene length (default: {512}) + max_predictions_per_seq {int} -- Max predicitons per sequence (default: {80}) + + Returns: + list -- Vectorized data + """ + assert task_type is not None - with open(path, "r", encoding="utf-8") as reader: - data = [] - cnt = 0 - for line in reader: - sample = json.loads(line) - sample["factor"] = factor - cnt += 1 - if is_train: - if (task_type == TaskType.Ranking) and ( - len(sample["token_id"][0]) > maxlen - or len(sample["token_id"][1]) > maxlen - ): - continue - if (task_type != TaskType.Ranking) and ( - len(sample["token_id"]) > maxlen - ): - continue - data.append(sample) - print("Loaded {} samples out of {}".format(len(data), cnt)) - return data + data = [] + + if task_type == TaskType.MaskLM: + tokenizer = BertTokenizer.from_pretrained( + bert_model, do_lower_case=do_lower_case + ) + vocab_words = list(tokenizer.vocab.keys()) + for doc in load_loose_json(vectorized_data): + paras = doc["text"].split("\n\n") + paras = [para.strip() for para in paras if len(para.strip()) > 0] + tokens = [tokenizer.tokenize(para) for para in paras] + data.append(tokens) + return data, tokenizer + + cnt = 0 + for sample in vectorized_data: + # sample = json.loads(line) + sample["factor"] = factor + cnt += 1 + if is_train: + if (task_type == TaskType.Ranking) and ( + len(sample["token_id"][0]) > maxlen + or len(sample["token_id"][1]) > maxlen + ): + continue + if (task_type != TaskType.Ranking) and ( + len(sample["token_id"]) > maxlen + ): + continue + data.append(sample) + logger.info(f"Loaded {len(data)} samples out of {cnt}") + return data, None def __len__(self): return len(self._data) def __getitem__(self, idx): - return { - "task": { - "task_id": self._task_id, - "task_type": self._task_type, - "data_type": self._data_type, - }, - "sample": self._data[idx], - } + if self._task_type == TaskType.MaskLM: + # create a MLM instance + instances = create_instances_from_document( + self._data, + idx, + self._max_seq_length, + self._short_seq_prob, + self._masked_lm_prob, + self._max_predictions_per_seq, + self._vocab_words, + self._rng, + ) + instance_ids = list(range(0, len(instances))) + choice = np.random.choice(instance_ids, 1)[0] + instance = instances[choice] + labels = self._tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + position = instance.masked_lm_positions + labels = [lab if idx in position else -1 for idx, lab in enumerate(labels)] + sample = { + "token_id": self._tokenizer.convert_tokens_to_ids(instance.tokens), + "type_id": instance.segment_ids, + "nsp_lab": 1 if instance.is_random_next else 0, + "position": instance.masked_lm_positions, + "label": labels, + "uid": idx, + } + return { + "task": {"task_id": self._task_id, "task_def": self._task_def}, + "sample": sample, + } + else: + return { + "task": { + "task_id": self._task_id, + "task_type": self._task_type, + "data_type": self._data_type, + }, + "sample": self._data[idx], + } class MTDNNCollater: def __init__( self, - is_train=True, - dropout_w=0.005, - soft_label=False, - encoder_type=EncoderModelType.BERT, + is_train: bool = True, + dropout_w: float = 0.005, + soft_label: bool = False, + encoder_type: int = EncoderModelType.BERT, ): self.is_train = is_train self.dropout_w = dropout_w @@ -271,7 +387,7 
@@ def collate_fn(self, batch): # unify to one type of label batch_info["label"] = len(batch_data) - 1 # batch_data.extend([torch.LongTensor(start), torch.LongTensor(end)]) - elif task_type == TaskType.SeqenceLabeling: + elif task_type == TaskType.SequenceLabeling: batch_size = self._get_batch_size(batch) tok_len = self._get_max_len(batch, key="token_id") tlab = torch.LongTensor(batch_size, tok_len).fill_(-1) @@ -280,9 +396,19 @@ def collate_fn(self, batch): tlab[i, :ll] = torch.LongTensor(label) batch_data.append(tlab) batch_info["label"] = len(batch_data) - 1 + elif task_type == TaskType.MaskLM: + batch_size = self._get_batch_size(batch) + tok_len = self._get_max_len(batch, key="token_id") + tlab = torch.LongTensor(batch_size, tok_len).fill_(-1) + for i, label in enumerate(labels): + ll = len(label) + tlab[i, :ll] = torch.LongTensor(label) + labels = torch.LongTensor([sample["nsp_lab"] for sample in batch]) + batch_data.append((tlab, labels)) + batch_info["label"] = len(batch_data) - 1 # soft label generated by ensemble models for knowledge distillation - if self.soft_label_on and (batch[0].get("softlabel", None) is not None): + if self.soft_label_on and batch[0].get("softlabel", None): assert ( task_type != TaskType.Span ) # Span task doesn't support soft label yet. diff --git a/mtdnn/modeling_mtdnn.py b/mtdnn/modeling_mtdnn.py index 0c9c12a..802a60f 100644 --- a/mtdnn/modeling_mtdnn.py +++ b/mtdnn/modeling_mtdnn.py @@ -15,18 +15,32 @@ import torch import torch.nn.functional as F import torch.optim as optim +from apex import amp from fairseq.models.roberta import RobertaModel as FairseqRobertModel +from pytorch_pretrained_bert import BertAdam as Adam from tensorboardX import SummaryWriter from torch import nn from torch.optim.lr_scheduler import * from torch.utils.data import DataLoader from transformers import ( + AlbertConfig, + AlbertModel, + AlbertTokenizer, BertConfig, BertModel, BertPreTrainedModel, + BertTokenizer, PretrainedConfig, PreTrainedModel, + RobertaConfig, RobertaModel, + RobertaTokenizer, + XLMRobertaConfig, + XLMRobertaModel, + XLMRobertaTokenizer, + XLNetConfig, + XLNetModel, + XLNetTokenizer, ) from mtdnn.common.archive_maps import PRETRAINED_MODEL_ARCHIVE_MAP @@ -36,14 +50,27 @@ from mtdnn.common.loss import LOSS_REGISTRY from mtdnn.common.metrics import calc_metrics from mtdnn.common.san import SANBERTNetwork, SANClassifier +from mtdnn.common.san_model import SanModel from mtdnn.common.squad_utils import extract_answer, merge_answers, select_answers from mtdnn.common.types import DataFormat, EncoderModelType, TaskType from mtdnn.common.utils import MTDNNCommonUtils from mtdnn.configuration_mtdnn import MTDNNConfig from mtdnn.dataset_mtdnn import MTDNNCollater from mtdnn.tasks.config import MTDNNTaskDefs +from mtdnn.tasks.utils import submit -logger = MTDNNCommonUtils.setup_logging() + +logger = MTDNNCommonUtils.create_logger(__name__, to_disk=True) + +# Supported Model Classes Map +MODEL_CLASSES = { + "bert": (BertConfig, BertModel, BertTokenizer), + "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer), + "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer), + "albert": (AlbertConfig, AlbertModel, AlbertTokenizer), + "xlmroberta": (XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer), + "san": (BertConfig, SanModel, BertTokenizer), +} class MTDNNPretrainedModel(nn.Module): @@ -163,7 +190,8 @@ def __init__( with MTDNNCommonUtils.download_path() as file_path: path = pathlib.Path(file_path) self.local_model_path = MTDNNCommonUtils.maybe_download( - 
url=self.pretrained_model_archive_map[pretrained_model_name] + url=self.pretrained_model_archive_map[pretrained_model_name], + log=logger, ) self.bert_model = MTDNNCommonUtils.load_pytorch_model(self.local_model_path) self.state_dict = self.bert_model["state"] @@ -306,8 +334,6 @@ def _setup_optim( if self.config.fp16: try: - from apex import amp - global amp except ImportError: raise ImportError( diff --git a/mtdnn/process_mtdnn.py b/mtdnn/process_mtdnn.py index e7205bd..f3b8c07 100644 --- a/mtdnn/process_mtdnn.py +++ b/mtdnn/process_mtdnn.py @@ -9,7 +9,6 @@ from tensorboardX import SummaryWriter from torch.utils.data import BatchSampler, DataLoader, Dataset -from mtdnn.common.glue.glue_utils import submit from mtdnn.common.types import TaskType from mtdnn.common.utils import MTDNNCommonUtils from mtdnn.configuration_mtdnn import MTDNNConfig @@ -21,8 +20,9 @@ ) from mtdnn.modeling_mtdnn import MTDNNModel from mtdnn.tasks.config import MTDNNTaskDefs +from mtdnn.tasks.utils import submit -logger = MTDNNCommonUtils.setup_logging(mode="w") +logger = MTDNNCommonUtils.create_logger(__name__, to_disk=True) class MTDNNDataProcess: @@ -30,21 +30,25 @@ def __init__( self, config: MTDNNConfig, task_defs: MTDNNTaskDefs, - data_dir: str, - train_datasets_list: list = ["mnli"], - test_datasets_list: list = ["mnli_mismatched,mnli_matched"], + vectorized_data: dict, glue_format: bool = False, data_sort: bool = False, ): - assert len(train_datasets_list) >= 1, "Train dataset list cannot be empty" - assert len(test_datasets_list) >= 1, "Test dataset list cannot be empty" + assert vectorized_data, "[ERROR] - Vectorized data cannot be None" # Initialize class members self.config = config self.task_defs = task_defs - self.train_datasets = train_datasets_list - self.test_datasets = test_datasets_list - self.data_dir = data_dir + self.train_datasets_list = list( + filter(lambda file_name: "train" in file_name, vectorized_data.keys()) + ) + self.test_dev_datasets_list = list( + filter( + lambda file_name: "dev" in file_name or "test" in file_name, + vectorized_data.keys(), + ) + ) + self.vectorized_data = vectorized_data self.glue_format = glue_format self.data_sort = data_sort self.tasks = {} @@ -75,7 +79,7 @@ def _process_train_datasets(self): logger.info("Starting to process the training data sets") train_datasets = [] - for dataset in self.train_datasets: + for dataset in self.train_datasets_list: prefix = dataset.split("_")[0] if prefix in self.tasks: continue @@ -121,13 +125,14 @@ def _process_train_datasets(self): dropout_p = self.task_defs.dropout_p_map.get(prefix, self.config.dropout_p) self.dropout_list.append(dropout_p) - train_path = os.path.join(self.data_dir, f"{dataset}_train.json") - assert os.path.exists( - train_path - ), f"[ERROR] - Training dataset does not exist" - logger.info(f"Loading {train_path} as task {task_id}") + train_data = self.vectorized_data[dataset] + assert ( + train_data + ), f"[ERROR] - Training dataset for {dataset} does not exist." 
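A minimal, self-contained sketch of how the key filtering above behaves, assuming `vectorized_data` is keyed by `<task>_<split>` names (the example keys below are made up for illustration):

```Python
# Hypothetical keys in the <task>_<split> form the filters above expect
vectorized_data = {
    "mnli_train": [],
    "mnli_dev_matched": [],
    "mnli_dev_mismatched": [],
    "mnli_test_matched": [],
    "mnli_test_mismatched": [],
}

# Same filtering logic as in MTDNNDataProcess.__init__ above
train_datasets_list = list(
    filter(lambda file_name: "train" in file_name, vectorized_data.keys())
)
test_dev_datasets_list = list(
    filter(
        lambda file_name: "dev" in file_name or "test" in file_name,
        vectorized_data.keys(),
    )
)

print(train_datasets_list)     # ['mnli_train']
print(test_dev_datasets_list)  # ['mnli_dev_matched', 'mnli_dev_mismatched',
                               #  'mnli_test_matched', 'mnli_test_mismatched']

# Downstream, each key is mapped back to its task via the prefix before "_"
print("mnli_dev_matched".split("_")[0])  # 'mnli'
```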
+ + logger.info(f"Loading {dataset} as task {task_id}") train_data_set = MTDNNSingleTaskDataset( - train_path, + train_data, True, maxlen=self.config.max_seq_len, task_id=task_id, @@ -165,7 +170,7 @@ def _process_dev_test_datasets(self): test_collater = MTDNNCollater( is_train=False, encoder_type=self.config.encoder_type ) - for dataset in self.test_datasets: + for dataset in self.test_dev_datasets_list: prefix = dataset.split("_")[0] task_id = ( self.tasks_class[self.task_defs.n_class_map[prefix]] @@ -181,46 +186,30 @@ def _process_dev_test_datasets(self): assert prefix in self.task_defs.data_type_map data_type = self.task_defs.data_type_map[prefix] - dev_path = os.path.join(self.data_dir, f"{dataset}_dev.json") - assert os.path.exists( - dev_path - ), f"[ERROR] - Dev dataset does not exist: {dev_path}" - dev_data = None - if os.path.exists(dev_path): - dev_data_set = MTDNNSingleTaskDataset( - dev_path, - False, - maxlen=self.config.max_seq_len, - task_id=task_id, - task_type=task_type, - data_type=data_type, - ) - dev_data = DataLoader( - dev_data_set, - batch_size=self.config.batch_size_eval, - collate_fn=test_collater.collate_fn, - pin_memory=self.config.cuda, - ) - dev_dataloaders_list.append(dev_data) - - test_path = os.path.join(self.data_dir, f"{dataset}_test.json") - test_data = None - if os.path.exists(test_path): - test_data_set = MTDNNSingleTaskDataset( - test_path, - False, - maxlen=self.config.max_seq_len, - task_id=task_id, - task_type=task_type, - data_type=data_type, - ) - test_data = DataLoader( - test_data_set, - batch_size=self.config.batch_size_eval, - collate_fn=test_collater.collate_fn, - pin_memory=self.config.cuda, - ) - test_dataloaders_list.append(test_data) + # Process datasets for the dev and tests + data_rows = self.vectorized_data[dataset] + assert ( + data_rows + ), f"[ERROR] - Test/Dev dataset for {dataset} does not exist." 
+ logger.info(f"Loading {dataset} as task {task_id}") + data_set = MTDNNSingleTaskDataset( + data_rows, + False, + maxlen=self.config.max_seq_len, + task_id=task_id, + task_type=task_type, + data_type=data_type, + ) + data_loader = DataLoader( + data_set, + batch_size=self.config.batch_size_eval, + collate_fn=test_collater.collate_fn, + pin_memory=self.config.cuda, + ) + + dev_dataloaders_list.append( + data_loader + ) if "dev" in dataset else test_dataloaders_list.append(data_loader) # Return tuple of dev and test dataloaders return dev_dataloaders_list, test_dataloaders_list diff --git a/mtdnn/tasks/config.py b/mtdnn/tasks/config.py index 1db2b00..97593bf 100644 --- a/mtdnn/tasks/config.py +++ b/mtdnn/tasks/config.py @@ -14,16 +14,25 @@ import os from typing import Union +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F import yaml +from mtdnn.common.san import SANClassifier + from mtdnn.common.loss import LossCriterion from mtdnn.common.metrics import Metric -from mtdnn.common.types import DataFormat, EncoderModelType, TaskType -from mtdnn.common.vocab import Vocabulary +from mtdnn.common.types import DataFormat, EncoderModelType, TaskDefType, TaskType from mtdnn.common.utils import MTDNNCommonUtils +from mtdnn.common.vocab import Vocabulary + +logger = MTDNNCommonUtils.create_logger(__name__, to_disk=True) -logger = MTDNNCommonUtils.setup_logging() +TASK_REGISTRY = {} +TASK_CLASS_NAMES = set() class TaskConfig(object): @@ -39,10 +48,40 @@ def __init__(self, **kwargs: dict): """ Define a generic task configuration """ logger.info("Mapping Task attributes") + # assert data exists for preprocessing + assert ( + "data_source_dir" in kwargs + ), "[ERROR] - Source data directory with data splits not provided" + assert ( + kwargs["data_source_dir"] and type(kwargs["data_source_dir"]) == str + ), "[ERROR] - Source data directory path must be a string" + assert kwargs[ + "data_source_dir" + ], "[ERROR] - Source data directory path cannot be empty" + assert os.path.isdir( + kwargs["data_source_dir"] + ), "[ERROR] - Source data directory path does not exist" + + assert all( + os.path.exists(os.path.join(kwargs["data_source_dir"], f"{split}.tsv")) + for split in kwargs["split_names"] + ), f"[ERROR] - All data splits do not exist in path - {kwargs['data_source_dir']}" + + assert kwargs[ + "data_process_opts" + ], "[ERROR] - Source data processing options must be set" + # Mapping attributes for key, value in kwargs.items(): try: - setattr(self, key, value) + if key == "data_source_dir": + data_paths = [ + os.path.join(kwargs["data_source_dir"], f"{split}.tsv") + for split in kwargs["split_names"] + ] + setattr(self, "data_paths", data_paths) + else: + setattr(self, key, value) except AttributeError as err: logger.error( f"[ERROR] - Unable to set {key} with value {value} for {self}" @@ -244,7 +283,7 @@ def __init__(self, kwargs: dict = {}): "task_name": "stsb", "data_format": "PremiseAndOneHypothesis", "encoder_type": "BERT", - "enable_san": false, + "enable_san": False, "metric_meta": ["Pearson", "Spearman"], "n_class": 1, "loss": "MseCriterion", @@ -537,6 +576,23 @@ def __init__(self, kwargs: dict = {}): self.dropout_p = kwargs.pop("dropout_p", 0.1) +class MaskLMTaskConfig(TaskConfig): + def __init__(self, kwargs: dict = {}): + if not kwargs: + kwargs = { + "task_name": "MaskLM", + "data_format": "MLM", + "encoder_type": "BERT", + "enable_san": False, + "metric_meta": ["ACC"], + "n_class": 30522, + "task_type": "MaskLM", + "loss": "MlmCriterion", + 
"split_names": ["train", "dev"], + } + super(MaskLMTaskConfig, self).__init__(**kwargs) + + # Map of supported tasks SUPPORTED_TASKS_MAP = { "cola": COLATaskConfig, @@ -555,6 +611,7 @@ def __init__(self, kwargs: dict = {}): "chunk": CHUNKTaskConfig, "squad": SQUADTaskConfig, "squad-v2": SQUADTaskConfig, + "masklm": MaskLMTaskConfig, } @@ -598,7 +655,14 @@ class MTDNNTaskDefs: "loss": "CeCriterion", "kd_loss": "MseCriterion", "n_class": 2, - "task_type": "Classification" + "split_names": ["train", "test", "dev"], + "data_paths": ["CoLA/train.tsv","CoLA/dev.tsv","CoLA/test.tsv"], + "data_opts": { + "header": True, + "is_train": True, + "multi_snli": False, + }, + "task_type": "Classification", } ... } @@ -618,8 +682,15 @@ class MTDNNTaskDefs: "loss": "CeCriterion", "kd_loss": "MseCriterion", "n_class": 2, - "task_type": "Classification" - } + "split_names": ["train", "test", "dev"], + "data_paths": ["CoLA/train.tsv","CoLA/dev.tsv","CoLA/test.tsv"], + "data_opts": { + "header": True, + "is_train": True, + "multi_snli": False, + }, + "task_type": "Classification", + } ... } @@ -664,6 +735,8 @@ def __init__(self, task_dict_or_file: Union[str, dict]): encoderType_map = {} loss_map = {} kd_loss_map = {} + data_paths_map = {} + split_names_map = {} # Create an instance of task creator singleton task_creator = MTDNNTaskConfig() @@ -693,6 +766,10 @@ def __init__(self, task_dict_or_file: Union[str, dict]): label_mapper.add(label) global_map[name] = label_mapper + # split names + if hasattr(task, "split_names"): + split_names_map[name] = task.split_names + # dropout if hasattr(task, "dropout_p"): dropout_p_map[name] = task.dropout_p @@ -712,6 +789,13 @@ def __init__(self, task_dict_or_file: Union[str, dict]): else: kd_loss_map[name] = None + # Map train, test (and dev) data paths + data_paths_map[name] = { + "data_paths": task.data_paths or [], + "data_opts": task.data_process_opts + or {"header": True, "is_train": True, "multi_snli": False,}, + } + # Track configured tasks for downstream self._configured_tasks.append(task.to_dict()) @@ -730,11 +814,210 @@ def __init__(self, task_dict_or_file: Union[str, dict]): self.encoderType = uniq_encoderType.pop() self.loss_map = loss_map self.kd_loss_map = kd_loss_map + self.data_paths_map = data_paths_map + self.split_names_map = split_names_map - def get_configured_tasks(self) -> list: - """Returns a list of configured tasks by TaskDefs class from the input configuration file + def get_configured_tasks(self): + """Returns a list of configured tasks objects by TaskDefs class from the input configuration file Returns: list -- List of configured task classes """ return self._configured_tasks + + def get_task_names(self): + """ Returns a list of configured task names + + Returns: + list -- List of configured task classes + """ + return self.task_type_map.keys() + + def get_task_def(self, task_name: str = ""): + """Returns a dictionary of parameters for specified task + + Keyword Arguments: + task_name {str} -- Task name for definition to get (default: {""}) + + Returns: + dict -- Task definition for specified task + """ + assert task_name in self.task_type_map, "[ERROR] - Task is not configured" + # return { + # k: v + # for ele in self.get_configured_tasks() + # for k, v in ele.items() + # if ele["task_name"] == task_name + # } + return TaskDefType( + self.global_map.get(task_name, None), + self.n_class_map[task_name], + self.data_type_map[task_name], + self.task_type_map[task_name], + self.metric_meta_map[task_name], + self.split_names_map[task_name], + 
self.enable_san_map[task_name], + self.dropout_p_map.get(task_name, None), + self.loss_map[task_name], + self.kd_loss_map[task_name], + self.data_paths_map[task_name], + ) + + +class MTDNNTask: + def __init__(self, task_def): + self._task_def = task_def + + def input_parse_label(self, label: str): + raise NotImplementedError() + + @staticmethod + def input_is_valid_sample(sample, max_len): + return len(sample["token_id"]) <= max_len + + @staticmethod + def train_prepare_label(labels): + raise NotImplementedError() + + @staticmethod + def train_prepare_soft_label(softlabels): + raise NotImplementedError() + + @staticmethod + def train_build_task_layer(decoder_opt, hidden_size, lab, opt, prefix, dropout): + if decoder_opt == 1: + out_proj = SANClassifier( + hidden_size, hidden_size, lab, opt, prefix, dropout=dropout + ) + else: + out_proj = nn.Linear(hidden_size, lab) + return out_proj + + @staticmethod + def train_forward( + sequence_output, + pooled_output, + premise_mask, + hyp_mask, + decoder_opt, + dropout_layer, + task_layer, + ): + if decoder_opt == 1: + max_query = hyp_mask.size(1) + assert max_query > 0 + assert premise_mask is not None + assert hyp_mask is not None + hyp_mem = sequence_output[:, :max_query, :] + logits = task_layer(sequence_output, hyp_mem, premise_mask, hyp_mask) + else: + pooled_output = dropout_layer(pooled_output) + logits = task_layer(pooled_output) + return logits + + @staticmethod + def test_prepare_label(batch_info, labels): + batch_info["label"] = labels + + @staticmethod + def test_predict(score): + raise NotImplementedError() + + +def register_task(name): + """ + @register_task('Classification') + class ClassificationTask(MTDNNTask): + (...) + + .. note:: + + All Tasks must implement the :class:`~MTDNNTask` + interface. 
+ + Args: + name (str): the name of the task + """ + + def register_task_cls(cls): + if name in TASK_REGISTRY: + raise ValueError("Cannot register duplicate task ({})".format(name)) + if not issubclass(cls, MTDNNTask): + raise ValueError( + "Task ({}: {}) must extend MTDNNTask".format(name, cls.__name__) + ) + if cls.__name__ in TASK_CLASS_NAMES: + raise ValueError( + "Cannot register task with duplicate class name ({})".format( + cls.__name__ + ) + ) + TASK_REGISTRY[name] = cls + TASK_CLASS_NAMES.add(cls.__name__) + return cls + + return register_task_cls + + +def get_task_obj(task_def): + task_name = task_def.task_type.name + task_cls = TASK_REGISTRY.get(task_name, None) + if task_cls is None: + return None + + return task_cls(task_def) + + +@register_task("Regression") +class RegressionTask(MTDNNTask): + def __init__(self, task_def): + super().__init__(task_def) + + def input_parse_label(self, label: str): + return float(label) + + @staticmethod + def train_prepare_label(labels): + return torch.FloatTensor(labels) + + @staticmethod + def train_prepare_soft_label(softlabels): + return torch.FloatTensor(softlabels) + + @staticmethod + def test_predict(score): + score = score.data.cpu() + score = score.numpy() + predict = np.argmax(score, axis=1).tolist() + score = score.reshape(-1).tolist() + return score, predict + + +@register_task("Classification") +class ClassificationTask(MTDNNTask): + def __init__(self, task_def): + super().__init__(task_def) + + def input_parse_label(self, label: str): + label_dict = self._task_def.label_vocab + if label_dict is not None: + return label_dict[label] + else: + return int(label) + + @staticmethod + def train_prepare_label(labels): + return torch.LongTensor(labels) + + @staticmethod + def train_prepare_soft_label(softlabels): + return torch.FloatTensor(softlabels) + + @staticmethod + def test_predict(score): + score = F.softmax(score, dim=1) + score = score.data.cpu() + score = score.numpy() + predict = np.argmax(score, axis=1).tolist() + score = score.reshape(-1).tolist() + return score, predict diff --git a/mtdnn/tasks/mlm_utils.py b/mtdnn/tasks/mlm_utils.py new file mode 100644 index 0000000..99ea3f5 --- /dev/null +++ b/mtdnn/tasks/mlm_utils.py @@ -0,0 +1,245 @@ +# coding=utf-8 +# Copyright (c) Microsoft. All rights reserved. +# Code is adpated from https://github.com/google-research/bert +import collections +import json +import os +from collections import namedtuple +from typing import Union + +MaskedLmInstance = namedtuple("MaskedLmInstance", ["index", "label"]) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +class TrainingInstance(object): + def __init__( + self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, is_random_next + ): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __repr__(self): + return self.__str__() + + +def load_loose_json(load_path_or_vectorized_data: Union[str, list]): + """Load vectorized jsonl data. Takes either path to the file or iterable + + Arguments: + load_path_or_vectorized_data {Union[str, list]} -- Path to vectorized data or iterable containing data + + Raises: + ValueError: Error in input + + Returns: + list -- Loaded data + """ + if isinstance(load_path_or_vectorized_data, str): + assert os.path.exists( + load_path_or_vectorized_data + ), "[ERROR] - Load path does not exist." + data = open(load_path_or_vectorized_data, "r", encoding="utf-8") + elif isinstance(load_path_or_vectorized_data, list): + data = load_path_or_vectorized_data + else: + raise ValueError(load_path_or_vectorized_data) + rows = [] + for line in data: + rows.append(json.loads(line)) + return rows + + +def create_masked_lm_predictions( + tokens, + masked_lm_prob, + max_predictions_per_seq, + vocab_words, + rng, + do_whole_word_mask=True, +): + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if do_whole_word_mask and len(cand_indexes) >= 1 and token.startswith("##"): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + rng.shuffle(cand_indexes) + output_tokens = list(tokens) + num_to_predict = min( + max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))) + ) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def create_instances_from_document( + all_documents, + document_index, + max_seq_length, + short_seq_prob, + masked_lm_prob, + max_predictions_per_seq, + vocab_words, + rng, +): + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. 
+ a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + ( + tokens, + masked_lm_positions, + masked_lm_labels, + ) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng + ) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels, + ) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances diff --git a/mtdnn/tasks/utils.py b/mtdnn/tasks/utils.py new file mode 100644 index 0000000..0e80a8d --- /dev/null +++ b/mtdnn/tasks/utils.py @@ -0,0 +1,504 @@ +# coding=utf-8 +# Copyright (c) Microsoft. All rights reserved. 
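As a usage illustration of the whole-word masking helpers defined in `mtdnn/tasks/mlm_utils.py` above, here is a minimal sketch; the token list, vocabulary, and seed are made-up toy values:

```Python
import random

from mtdnn.tasks.mlm_utils import create_masked_lm_predictions

rng = random.Random(13)

# Toy WordPiece-style inputs (illustrative only)
vocab_words = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
               "the", "cat", "sat", "on", "mat", "##s"]
tokens = ["[CLS]", "the", "cat", "sat", "on", "the", "mat", "##s", "[SEP]"]

output_tokens, masked_positions, masked_labels = create_masked_lm_predictions(
    tokens,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    vocab_words=vocab_words,
    rng=rng,
)

# Roughly 15% of the (whole-word) positions are replaced with [MASK], kept as-is,
# or swapped for a random vocabulary word; the original tokens come back as labels.
print(output_tokens)
print(masked_positions, masked_labels)
```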
+import os +import pdb +from random import shuffle +from sys import path + +from mtdnn.common.metrics import calc_metrics +from mtdnn.common.types import DataFormat + + +def process_data_and_dump_rows( + rows: list, + out_path: str, + data_format: DataFormat, + write_mode: str = "w", + dump_rows: bool = False, +) -> None: + """ + Output files should have following format + :param rows: data + :param out_path: output file path + :return: processed_rows: List of string rows + """ + processed_rows = [] + for row in rows: + data = "" + if data_format in [DataFormat.PremiseOnly, DataFormat.Sequence]: + for col in ["uid", "label", "premise"]: + if "\t" in str(row[col]): + pdb.set_trace() + data = f"{row['uid']}\t{row['label']}\t{row['premise']}\n" + elif data_format == DataFormat.PremiseAndOneHypothesis: + for col in ["uid", "label", "premise", "hypothesis"]: + if "\t" in str(row[col]): + pdb.set_trace() + data = ( + f"{row['uid']}\t{row['label']}\t{row['premise']}\t{row['hypothesis']}\n" + ) + elif data_format == DataFormat.PremiseAndMultiHypothesis: + for col in ["uid", "label", "premise"]: + if "\t" in str(row[col]): + pdb.set_trace() + hypothesis = row["hypothesis"] + for one_hypo in hypothesis: + if "\t" in str(one_hypo): + pdb.set_trace() + hypothesis = "\t".join(hypothesis) + data = f"{row['uid']}\t{row['ruid']}\t{row['label']}\t{row['premise']}\t{hypothesis}\n" + else: + raise ValueError(data_format) + processed_rows.append(data) + + # Save data if dump_rows is true + if dump_rows: + with open(out_path, mode=write_mode, encoding="utf-8") as out_f: + out_f.writelines(processed_rows) + return processed_rows + + +def load_scitail(file_path, kwargs: dict = {}): + """ Loading scitail """ + + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + blocks = line.strip().split("\t") + assert len(blocks) > 2 + if blocks[0] == "-": + continue + sample = { + "uid": str(cnt), + "premise": blocks[0], + "hypothesis": blocks[1], + "label": blocks[2], + } + rows.append(sample) + cnt += 1 + return rows + + +def load_snli(file_path, kwargs: dict = {}): + """ Load SNLI """ + header = kwargs.get("header", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + assert len(blocks) > 10 + if blocks[-1] == "-": + continue + lab = blocks[-1] + if lab is None: + import pdb + + pdb.set_trace() + sample = { + "uid": blocks[0], + "premise": blocks[7], + "hypothesis": blocks[8], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_mnli(file_path, kwargs: dict = {}): + """ Load MNLI """ + header = kwargs.get("header", True) + multi_snli = kwargs.get("multi_snli", False) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + assert len(blocks) > 9 + if blocks[-1] == "-": + continue + lab = "contradiction" + if is_train: + lab = blocks[-1] + if lab is None: + import pdb + + pdb.set_trace() + sample = { + "uid": blocks[0], + "premise": blocks[8], + "hypothesis": blocks[9], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_mrpc(file_path, kwargs: dict = {}): + """ Load MRPC """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + 
continue + blocks = line.strip().split("\t") + assert len(blocks) > 4 + lab = 0 + if is_train: + lab = int(blocks[0]) + sample = { + "uid": cnt, + "premise": blocks[-2], + "hypothesis": blocks[-1], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_qnli(file_path, kwargs: dict = {}): + """ Load QNLI for classification""" + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + assert len(blocks) > 2 + lab = "not_entailment" + if is_train: + lab = blocks[-1] + if lab is None: + import pdb + + pdb.set_trace() + sample = { + "uid": blocks[0], + "premise": blocks[1], + "hypothesis": blocks[2], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_qqp(file_path, kwargs: dict = {}): + """ Load QQP """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + skipped = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + if is_train and len(blocks) < 6: + skipped += 1 + continue + if not is_train: + assert len(blocks) == 3 + lab = 0 + if is_train: + lab = int(blocks[-1]) + sample = { + "uid": cnt, + "premise": blocks[-3], + "hypothesis": blocks[-2], + "label": lab, + } + else: + sample = { + "uid": int(blocks[0]), + "premise": blocks[-2], + "hypothesis": blocks[-1], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_rte(file_path, kwargs: dict = {}): + """ Load RTE """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + if is_train and len(blocks) < 4: + continue + if not is_train: + assert len(blocks) == 3 + lab = "not_entailment" + if is_train: + lab = blocks[-1] + sample = { + "uid": int(blocks[0]), + "premise": blocks[-3], + "hypothesis": blocks[-2], + "label": lab, + } + else: + sample = { + "uid": int(blocks[0]), + "premise": blocks[-2], + "hypothesis": blocks[-1], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_wnli(file_path, kwargs: dict = {}): + """ Load WNLI """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + if is_train and len(blocks) < 4: + continue + if not is_train: + assert len(blocks) == 3 + lab = 0 + if is_train: + lab = int(blocks[-1]) + sample = { + "uid": cnt, + "premise": blocks[-3], + "hypothesis": blocks[-2], + "label": lab, + } + else: + sample = { + "uid": cnt, + "premise": blocks[-2], + "hypothesis": blocks[-1], + "label": lab, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_sst(file_path, kwargs: dict = {}): + """ Load SST """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + if is_train and len(blocks) < 2: + continue + lab = 0 + if is_train: + lab = int(blocks[-1]) + sample = {"uid": cnt, "premise": blocks[0], "label": lab} + else: + 
sample = {"uid": int(blocks[0]), "premise": blocks[1], "label": lab} + + cnt += 1 + rows.append(sample) + return rows + + +def load_cola(file_path, kwargs: dict = {}): + """ Load COLA """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + if is_train and len(blocks) < 2: + continue + lab = 0 + if is_train: + lab = int(blocks[1]) + sample = {"uid": cnt, "premise": blocks[-1], "label": lab} + else: + sample = {"uid": cnt, "premise": blocks[-1], "label": lab} + rows.append(sample) + cnt += 1 + return rows + + +def load_stsb(file_path, kwargs: dict = {}): + """ Load STSB """ + + header = kwargs.get("header", True) + is_train = kwargs.get("is_train", True) + rows = [] + cnt = 0 + with open(file_path, encoding="utf8") as f: + for line in f: + if header: + header = False + continue + blocks = line.strip().split("\t") + assert len(blocks) > 8 + score = "0.0" + if is_train: + score = blocks[-1] + sample = { + "uid": cnt, + "premise": blocks[-3], + "hypothesis": blocks[-2], + "label": score, + } + else: + sample = { + "uid": cnt, + "premise": blocks[-2], + "hypothesis": blocks[-1], + "label": score, + } + rows.append(sample) + cnt += 1 + return rows + + +def load_conll_ner(file_path, kwargs: dict = {}): + """ Load NER """ + + rows = [] + cnt = 0 + sentence = [] + label = [] + with open(file_path, encoding="utf8") as f: + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("-DOCSTART") or line[0] == "\n": + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + rows.append(sample) + sentence = [] + label = [] + cnt += 1 + continue + splits = line.split(" ") + sentence.append(splits[0]) + label.append(splits[-1]) + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + return rows + + +def load_conll_pos(file_path, kwargs: dict = {}): + """ Load POS """ + + rows = [] + cnt = 0 + sentence = [] + label = [] + with open(file_path, encoding="utf8") as f: + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("-DOCSTART") or line[0] == "\n": + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + rows.append(sample) + sentence = [] + label = [] + cnt += 1 + continue + splits = line.split(" ") + sentence.append(splits[0]) + label.append(splits[1]) + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + return rows + + +def load_conll_chunk(file_path, kwargs: dict = {}): + """ Load CHUNK """ + + rows = [] + cnt = 0 + sentence = [] + label = [] + with open(file_path, encoding="utf8") as f: + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("-DOCSTART") or line[0] == "\n": + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + rows.append(sample) + sentence = [] + label = [] + cnt += 1 + continue + splits = line.split(" ") + sentence.append(splits[0]) + label.append(splits[2]) + if len(sentence) > 0: + sample = {"uid": cnt, "premise": sentence, "label": label} + return rows + + +def submit(path, data, label_dict=None): + header = "index\tprediction" + with open(path, "w") as writer: + predictions, uids = data["predictions"], data["uids"] + writer.write("{}\n".format(header)) + assert len(predictions) == len(uids) + # sort label + paired = [(int(uid), predictions[idx]) for idx, uid in 
enumerate(uids)] + paired = sorted(paired, key=lambda item: item[0]) + for uid, pred in paired: + if label_dict is None: + writer.write("{}\t{}\n".format(uid, pred)) + else: + assert type(pred) is int + writer.write("{}\t{}\n".format(uid, label_dict[pred])) diff --git a/mtdnn/tokenizer_mtdnn.py b/mtdnn/tokenizer_mtdnn.py new file mode 100644 index 0000000..7744baf --- /dev/null +++ b/mtdnn/tokenizer_mtdnn.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +from mtdnn.common.types import EncoderModelType +from mtdnn.modeling_mtdnn import MODEL_CLASSES +from mtdnn.tasks.config import MTDNNTaskDefs + + +class MTDNNTokenizer: + """Wraps the hugging face transformer tokenizer for Preprocessing GLUE/SNLI/SciTail datasets.""" + + def __init__( + self, model_name: str = "bert-base-uncased", do_lower_case: bool = False, + ): + self._model_name = model_name + self.literal_model_name = model_name.split("-")[0] + self.model_type = EncoderModelType[ + self.literal_model_name.upper() + ].name # BERT = 1, ROBERTA = 2 + mt_dnn_model_name_fmt = model_name.replace("-", "_") # format to mt-dnn format + self.mt_dnn_suffix = ( + f"{mt_dnn_model_name_fmt}_lower" + if do_lower_case + else f"{mt_dnn_model_name_fmt}" + ) + _, _, tokenizer_class = MODEL_CLASSES[self.literal_model_name] + self._tokenizer = tokenizer_class.from_pretrained( + model_name, do_lower_case=do_lower_case + ) + + def encode( + self, + text: str = "", + text_pair: str = "", + max_length: int = 512, + enable_padding: bool = False, + pad_on_left: bool = False, + pad_token: int = 0, + pad_token_segment_id: int = 0, + mask_padding_with_zero: bool = False, + ): + """ + Returns a tuple containing the encoded sequence or sequence pair and additional informations: + the input mask and segment id + """ + # set mask_padding_with_zero default value as False to keep consistent with original setting + inputs = self._tokenizer.encode_plus( + text, text_pair, add_special_tokens=True, max_length=max_length, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ padding_length = max_length - len(input_ids) + + if enable_padding: + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ( + [0 if mask_padding_with_zero else 1] * padding_length + ) + attention_mask + token_type_ids = ( + [pad_token_segment_id] * padding_length + ) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ( + [0 if mask_padding_with_zero else 1] * padding_length + ) + token_type_ids = token_type_ids + ( + [pad_token_segment_id] * padding_length + ) + + assert ( + len(input_ids) == max_length + ), f"[ERROR] - Input Ids length: {len(input_ids)} does not match max length: {max_length}" + + assert ( + len(attention_mask) == max_length + ), f"[ERROR] - Attention mask length: {len(attention_mask)} does not match max length: {max_length}" + + assert ( + len(token_type_ids) == max_length + ), f"[ERROR] - Token types id length: {len(token_type_ids)} does not match max length: {max_length}" + + if self.model_type.lower() in ["bert", "roberta"]: + attention_mask = None + + if self.model_type.lower() not in ["distilbert", "bert", "xlnet"]: + token_type_ids = [0] * len(token_type_ids) + + # input_ids, input_mask, segment_id + return ( + input_ids, + attention_mask, + token_type_ids, + ) + + def get_model_name(self) -> str: + return self._model_name diff --git a/sample_data/MNLI/README.txt b/sample_data/MNLI/README.txt new file mode 100644 index 0000000..c7b8beb --- /dev/null +++ b/sample_data/MNLI/README.txt @@ -0,0 +1,26 @@ +This is the 1.0 distribution of the The Multi-genre NLI (MultiNLI) Corpus. + +License information and a detailed description of the corpus are included in the accompanying PDF. + +If you use this corpus, please cite the attached data description paper. + +@InProceedings{williams2018broad, + author = {Williams, Adina and Nangia, Nikita and Bowman, Samuel R.}, + title = {A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference}, + booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, + year = {2018}, + publisher = {Association for Computational Linguistics}, +} + + +Project page: https://www.nyu.edu/projects/bowman/multinli/ + + +Release Notes +------------- + +1.0: +- Replaces values in pairID and promptID fields. PromptID values are now shared across examples + that were collected using the same prompt, as was originally intended, and pairID values are + simply promptID values with an extra letter indicating the specific field in the prompt that was + used. If you do not use these fields, this release is equivalent to 0.9. 
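To show the padding path of `MTDNNTokenizer.encode` added in `mtdnn/tokenizer_mtdnn.py` above, here is a minimal sketch; the sentences and the maximum length are arbitrary example values:

```Python
from mtdnn.tokenizer_mtdnn import MTDNNTokenizer

tokenizer = MTDNNTokenizer(model_name="bert-base-uncased", do_lower_case=True)

input_ids, attention_mask, token_type_ids = tokenizer.encode(
    text="The cat sat on the mat",
    text_pair="A dog slept on the rug",
    max_length=32,
    enable_padding=True,
)

# With enable_padding=True the id and segment lists are padded out to max_length;
# for BERT-style checkpoints the attention mask is returned as None.
assert len(input_ids) == 32 and len(token_type_ids) == 32
print(input_ids)
print(token_type_ids)
```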
diff --git a/sample_data/MNLI/dev_matched.tsv b/sample_data/MNLI/dev_matched.tsv new file mode 100644 index 0000000..a280fda --- /dev/null +++ b/sample_data/MNLI/dev_matched.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9e1f16b216b949175e1be84fb21124b192acfcbb18686229512d6964fddaf60 +size 10469267 diff --git a/sample_data/MNLI/dev_mismatched.tsv b/sample_data/MNLI/dev_mismatched.tsv new file mode 100644 index 0000000..04ecf76 --- /dev/null +++ b/sample_data/MNLI/dev_mismatched.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcda39204dc0d12f4d1aa9fa0a45c838ff7303aa4ecb441ea85ec34d9ccb9234 +size 11016832 diff --git a/sample_data/MNLI/diagnostic-full.tsv b/sample_data/MNLI/diagnostic-full.tsv new file mode 100644 index 0000000..47d0d2b --- /dev/null +++ b/sample_data/MNLI/diagnostic-full.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3e30cb6e5b8f229c0883f3da66d7378c6f037a5c89be9ca7d33929ab3333115 +size 265530 diff --git a/sample_data/MNLI/diagnostic.tsv b/sample_data/MNLI/diagnostic.tsv new file mode 100644 index 0000000..46935ee --- /dev/null +++ b/sample_data/MNLI/diagnostic.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e13510b1bb14436ff7e2ee82338f0efb0133ecf2e73507a697dc210db3f05fd +size 222257 diff --git a/sample_data/MNLI/test_matched.tsv b/sample_data/MNLI/test_matched.tsv new file mode 100644 index 0000000..6c45b06 --- /dev/null +++ b/sample_data/MNLI/test_matched.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046c38a0d5135e8f1bf079858bd6b48635e76033167df67f20f782ae32e69a21 +size 9892077 diff --git a/sample_data/MNLI/test_mismatched.tsv b/sample_data/MNLI/test_mismatched.tsv new file mode 100644 index 0000000..9669d06 --- /dev/null +++ b/sample_data/MNLI/test_mismatched.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa5cd194edbe27ef7c673ce7e15206a82905e18b77c53d1fcb4bf92ce09a240e +size 10349543 diff --git a/sample_data/MNLI/train.tsv b/sample_data/MNLI/train.tsv new file mode 100644 index 0000000..9611575 --- /dev/null +++ b/sample_data/MNLI/train.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546d2b36951bdb0aa48c72ff8cff03481689e22b571a31bfba9c3a0388177eef +size 409801190 diff --git a/sample_data/bert_uncased_lower/mnli/mnli_matched_dev.json b/sample_data/bert_uncased_lower/mnli/mnli_matched_dev.json deleted file mode 100644 index f66d162..0000000 --- a/sample_data/bert_uncased_lower/mnli/mnli_matched_dev.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42ae01c7f32f50ddcef3d1c09ee589a384596ef9634e2c3778cb59204bd84fec -size 4002207 diff --git a/sample_data/bert_uncased_lower/mnli/mnli_matched_test.json b/sample_data/bert_uncased_lower/mnli/mnli_matched_test.json deleted file mode 100644 index cc16f07..0000000 --- a/sample_data/bert_uncased_lower/mnli/mnli_matched_test.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4361fb059fbcdb3727398ba99754f27e6f445f27a02344c86456c23a3b55de61 -size 4021462 diff --git a/sample_data/bert_uncased_lower/mnli/mnli_mismatched_dev.json b/sample_data/bert_uncased_lower/mnli/mnli_mismatched_dev.json deleted file mode 100644 index e39dede..0000000 --- a/sample_data/bert_uncased_lower/mnli/mnli_mismatched_dev.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1430dfbfb26254a5bc35fadf15839b48a099a827b0e2fbba0b0bcf2d92dcb0a -size 
4156909 diff --git a/sample_data/bert_uncased_lower/mnli/mnli_mismatched_test.json b/sample_data/bert_uncased_lower/mnli/mnli_mismatched_test.json deleted file mode 100644 index 19db652..0000000 --- a/sample_data/bert_uncased_lower/mnli/mnli_mismatched_test.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8fcaa5a7ad081253619ae9f628c3e9e773d5bfba6178017c04092d015737a963 -size 4144939 diff --git a/sample_data/bert_uncased_lower/mnli/mnli_train.json b/sample_data/bert_uncased_lower/mnli/mnli_train.json deleted file mode 100644 index b8914b8..0000000 --- a/sample_data/bert_uncased_lower/mnli/mnli_train.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94ae34f18390c9d0f0a92e00d0312d09015c39ba4bfd6ff1c6ff6ec51b7728c6 -size 163466473 diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/generate_conda_file.py b/scripts/generate_conda_file.py index d1521a1..fb1be42 100644 --- a/scripts/generate_conda_file.py +++ b/scripts/generate_conda_file.py @@ -39,13 +39,13 @@ "ipykernel": "ipykernel>=4.6.1", "jupyter": "jupyter>=1.0.0", "matplotlib": "matplotlib>=2.2.2", - "numpy": "numpy>=1.13.3", + "numpy": "numpy>=1.16.2", "pandas": "pandas>=0.24.2", "pytest": "pytest>=3.6.4", "pytorch": "pytorch-cpu>=1.0.0", "scipy": "scipy>=1.0.0", "h5py": "h5py>=2.8.0", - "tensorflow": "tensorflow==1.15.0", + "tensorflow": "tensorflow==1.15.2", "tensorflow-hub": "tensorflow-hub==0.7.0", "dask": "dask[dataframe]==1.2.2", "papermill": "papermill>=1.0.1", @@ -67,7 +67,6 @@ "pyemd": "pyemd==0.5.1", "ipywebrtc": "ipywebrtc==0.4.3", "pre-commit": "pre-commit>=1.14.4", - "scikit-learn": "scikit-learn>=0.19.0,<=0.20.3", "seaborn": "seaborn>=0.9.0", "sklearn-crfsuite": "sklearn-crfsuite>=0.3.6", "spacy": "spacy==2.1.8", @@ -75,16 +74,25 @@ "https://github.com/explosion/spacy-models/releases/download/" "en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz" ), - "transformers": "transformers>=2.1.1", + "transformers": "transformers==2.3.0", "gensim": "gensim>=3.7.0", "nltk": "nltk>=3.4", - "seqeval": "seqeval>=0.0.12", + "seqeval": "seqeval==0.0.12", "bertsum": "git+https://github.com/daden-ms/BertSum.git@030c139c97bc57d0c31f6515b8bf9649f999a443#egg=BertSum", "pyrouge": "pyrouge>=0.1.3", "py-rouge": "py-rouge>=1.1", "torchtext": "torchtext>=0.4.0", "multiprocess": "multiprocess==0.70.9", "tensorboardX": "tensorboardX==1.8", + "tensorboard": "tensorboard", + "colorlog": "colorlog", + "boto3": "boto3", + "regex": "regex", + "scikit-learn": "scikit-learn", + "pyyaml": "pyyaml", + "future": "future", + "fairseq": "fairseq==0.8.0", + "sentencepiece": "sentencepiece", } PIP_GPU = { diff --git a/scripts/generate_requirements_txt.py b/scripts/generate_requirements_txt.py new file mode 100644 index 0000000..b1bacf3 --- /dev/null +++ b/scripts/generate_requirements_txt.py @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# This file outputs a requirements.txt based on the libraries defined in generate_conda_file.py +from scripts.generate_conda_file import ( + CONDA_BASE, + CONDA_GPU, + PIP_BASE, + PIP_GPU, + PIP_DARWIN, + PIP_LINUX, + PIP_WIN32, + CONDA_DARWIN, + CONDA_LINUX, + CONDA_WIN32, + PIP_DARWIN_GPU, + PIP_LINUX_GPU, + PIP_WIN32_GPU, + CONDA_DARWIN_GPU, + CONDA_LINUX_GPU, + CONDA_WIN32_GPU, +) + + +if __name__ == "__main__": + deps = list(CONDA_BASE.values()) + deps += list(CONDA_GPU.values()) + deps += list(PIP_BASE.values()) + deps += list(PIP_GPU.values()) + deps += list(PIP_DARWIN.values()) + deps += list(PIP_LINUX.values()) + deps += list(PIP_WIN32.values()) + deps += list(CONDA_DARWIN.values()) + deps += list(CONDA_LINUX.values()) + deps += list(CONDA_WIN32.values()) + deps += list(PIP_DARWIN_GPU.values()) + deps += list(PIP_LINUX_GPU.values()) + deps += list(PIP_WIN32_GPU.values()) + deps += list(CONDA_DARWIN_GPU.values()) + deps += list(CONDA_LINUX_GPU.values()) + deps += list(CONDA_WIN32_GPU.values()) + with open("requirements.txt", "w") as f: + f.write("\n".join(set(deps))) +