From e04f12071beed50e2c0af353469d3dfc40055ce2 Mon Sep 17 00:00:00 2001 From: Taha Yusuf Ceritli Date: Thu, 10 Sep 2020 23:04:58 +0300 Subject: [PATCH 1/4] added pretty print --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f91976d..f215a87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ matplotlib nbval numpy pandas +pprint scipy scikit-learn seaborn From 4230b88ed222d449d605c9f76fe54bc1a168172d Mon Sep 17 00:00:00 2001 From: Taha Yusuf Ceritli Date: Thu, 10 Sep 2020 23:05:41 +0300 Subject: [PATCH 2/4] added schema functions --- notebooks/demo_anomaly.ipynb | 32 ++++++++----- notebooks/demo_column.ipynb | 33 +++++++++++--- notebooks/demo_missing.ipynb | 16 +++---- ptype/Ptype.py | 88 +++++++++++++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 28 deletions(-) diff --git a/notebooks/demo_anomaly.ipynb b/notebooks/demo_anomaly.ipynb index 2e90197..22d3ba2 100644 --- a/notebooks/demo_anomaly.ipynb +++ b/notebooks/demo_anomaly.ipynb @@ -17,7 +17,7 @@ "# NBVAL_IGNORE_OUTPUT\n", "import sys\n", "sys.path.insert(0, '../')\n", - "!{sys.executable} -m pip install -r ../requirements.txt\n" + "# !{sys.executable} -m pip install -r ../requirements.txt\n" ] }, { @@ -45,9 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" + "ptype = Ptype(model_folder='../models/')" ] }, { @@ -60,7 +58,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "column = 'Status'\n", @@ -69,6 +69,16 @@ "df_subsample" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = read_data(dataset_name='data_gov_10151_1', header=0)\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -100,10 +110,10 @@ "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='anomaly')" ] @@ -159,13 +169,13 @@ "outputs": [], "source": [ "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='anomaly')" diff --git a/notebooks/demo_column.ipynb b/notebooks/demo_column.ipynb index c631127..b8a8ff1 100644 --- a/notebooks/demo_column.ipynb +++ b/notebooks/demo_column.ipynb @@ -34,6 +34,7 @@ "plt.rcdefaults()\n", "import numpy as np\n", "import pandas as pd\n", + "import pprint\n", "import seaborn as sns\n", "\n", "from ptype.Ptype import Ptype, Column2ARFF\n", @@ -149,7 +150,7 @@ "# check counts of missing data\n", "missing_data = df.isnull().sum()\n", "missing_data.sort_values(inplace=True, ascending=False)\n", - "missing_data.head()\n", + 
"display(missing_data.head())\n", "\n", "# drop rows\n", "n = df.shape[0]\n", @@ -184,6 +185,17 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas.io.json import build_table_schema\n", + "\n", + "build_table_schema(df)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -206,11 +218,18 @@ "df = df[features+target]\n", "df.head()\n", "\n", - "ptype = Ptype()\n", - "ptype.run_inference(df)\n", + "ptype = Ptype(model_folder='../models/')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "schema = ptype.fit_schema(df)\n", "\n", - "df = ptype.get_final_df()\n", - "df.head()" + "pprint.pprint(schema)" ] }, { @@ -219,7 +238,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.dtypes" + "df = ptype.transform_schema(df, schema)" ] }, { @@ -237,7 +256,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype.show_missing_values()" + "df.dtypes" ] }, { diff --git a/notebooks/demo_missing.ipynb b/notebooks/demo_missing.ipynb index 0d15ab1..8c12afa 100644 --- a/notebooks/demo_missing.ipynb +++ b/notebooks/demo_missing.ipynb @@ -45,9 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" + "ptype = Ptype(model_folder='../models/')" ] }, { @@ -80,10 +78,10 @@ "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='missing')" ] @@ -139,13 +137,13 @@ "outputs": [], "source": [ "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='missing')" ] diff --git a/ptype/Ptype.py b/ptype/Ptype.py index 0c4f01d..7807c0d 100644 --- a/ptype/Ptype.py +++ b/ptype/Ptype.py @@ -78,6 +78,15 @@ def show_results(self): print("\tfraction of missing:", round(missing / total, 2), "\n") print("\tfraction of anomalies:", round(anomalies / total, 2), "\n") + def get_ratio(self, status): + indices = [ + i + for i, _ in enumerate(self.unique_vals) + if self.unique_vals_status[i] == status + ] + total = sum(self.unique_vals_counts) + return round(sum(self.unique_vals_counts[indices]) / total, 2) + def get_normal_predictions(self): """Values identified as 'normal'.""" return [ @@ -113,7 +122,7 @@ def replace_missing(self, v): class Ptype: - def __init__(self, _exp_num=0, _types=None): + def __init__(self, _exp_num=0, _types=None, model_folder="models/"): default_types = { 1: "integer", 2: "string", @@ -134,6 +143,7 @@ def __init__(self, _exp_num=0, _types=None): self.features = {} self.verbose = False self.cols = {} # column-indexed + self.column2ARFF = Column2ARFF(model_folder) def set_data(self, df): _dataset_name = "demo" @@ -350,6 
+360,82 @@ def get_final_df(self):
         df_final = self.update_dtypes(df_final)
         return df_final
 
+    def fit_schema(self, df):
+        """Generates a schema for a given data frame.
+
+        This function calculates the ptype outputs for a data frame and
+        stores them in a schema.
+
+        Parameters
+        ----------
+        df: Pandas dataframe object.
+
+
+        Returns
+        -------
+        schema: Schema object.
+        """
+        self.run_inference(df)
+
+        # predicts the corresponding ARFF types
+        for col_name in self.cols:
+            features = self.features[col_name]
+            self.cols[col_name].arff_type = self.column2ARFF.get_arff_type(features)
+
+        ptype_pandas_mapping = {"integer": "Int64"}
+        schema = {}
+        for col_name in df:
+            col = self.cols[col_name]
+            t = col.predicted_type
+            arff_type = col.arff_type
+            normal_values = list(np.unique(col.get_normal_predictions()))
+            missing_values = list(np.unique(col.get_missing_data_predictions()))
+            anomalies = list(np.unique(col.get_anomaly_predictions()))
+            missingness_ratio = col.get_ratio(Status.MISSING)
+            anomalous_ratio = col.get_ratio(Status.ANOMALOUS)
+
+            schema[col_name] = {
+                "type": t,
+                "dtype": ptype_pandas_mapping[t],
+                "arff_type": arff_type,
+                "missing_values": missing_values,
+                "missingness_ratio": missingness_ratio,
+                "anomalies": anomalies,
+                "anomalous_ratio": anomalous_ratio,
+            }
+            if arff_type == "nominal":
+                schema[col_name]["categorical_values"] = normal_values
+        return schema
+
+    def transform_schema(self, df, schema):
+        """Transforms a data frame according to a schema.
+
+        This function replaces the values detected as missing data with pd.NA and updates the column dtypes.
+
+        Parameters
+        ----------
+        df: Pandas dataframe object.
+        schema: Schema object.
+
+        Returns
+        -------
+        df_new: Transformed Pandas dataframe object.
+        """
+        df_new = df.copy()
+
+        # encodes missing data
+        df_new = df_new.apply(self.as_normal(schema), axis=0)
+
+        # changes dtypes
+        df_new = self.update_dtypes(df_new)
+
+        return df_new
+
+    def as_normal(self, schema):
+        return lambda series: series.map(
+            lambda v: pd.NA if v in schema[series.name]["missing_values"] else v
+        )
+
     def detect_missing_anomalies(self, inferred_column_type):
         if inferred_column_type != "all identical":
             row_posteriors = self.model.p_z[:, np.argmax(self.model.p_t), :]

From 2d3aff62fe282bc07da93c6b5c33259645716979 Mon Sep 17 00:00:00 2001
From: Taha Yusuf Ceritli
Date: Thu, 10 Sep 2020 23:27:33 +0300
Subject: [PATCH 3/4] removed pprint

---
 notebooks/demo_column.ipynb | 6 +++---
 requirements.txt            | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/notebooks/demo_column.ipynb b/notebooks/demo_column.ipynb
index b8a8ff1..f61d4fd 100644
--- a/notebooks/demo_column.ipynb
+++ b/notebooks/demo_column.ipynb
@@ -34,7 +34,7 @@
     "plt.rcdefaults()\n",
     "import numpy as np\n",
     "import pandas as pd\n",
-    "import pprint\n",
+    "# import pprint\n",
     "import seaborn as sns\n",
     "\n",
     "from ptype.Ptype import Ptype, Column2ARFF\n",
@@ -228,8 +228,8 @@
    "outputs": [],
    "source": [
     "schema = ptype.fit_schema(df)\n",
-    "\n",
-    "pprint.pprint(schema)"
+    "schema\n",
+    "# pprint.pprint(schema)"
    ]
   },
diff --git a/requirements.txt b/requirements.txt
index f215a87..f91976d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,6 @@ matplotlib
 nbval
 numpy
 pandas
-pprint
 scipy
 scikit-learn
 seaborn

From 468e6e1a962b525b9412e48ffd2f5912423c6a83 Mon Sep 17 00:00:00 2001
From: Taha Yusuf Ceritli
Date: Thu, 10 Sep 2020 23:38:03 +0300
Subject: [PATCH 4/4] deleted old notebook

---
 notebooks/demo.ipynb | 442 ------------------------------------------
 1 file changed, 442 deletions(-)
 delete mode 100644 notebooks/demo.ipynb

diff --git 
a/notebooks/demo.ipynb b/notebooks/demo.ipynb deleted file mode 100644 index 8ca6ca2..0000000 --- a/notebooks/demo.ipynb +++ /dev/null @@ -1,442 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we present various usecases to interact with ptype to handle:\n", - "\n", - "- incorrect column type predictions,\n", - "- incorrect missing type predictions.\n", - "- incorrect anomaly type predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Preamble to run notebook in context of source package.\n", - "# NBVAL_IGNORE_OUTPUT\n", - "import sys\n", - "sys.path.insert(0, '../')\n", - "!{sys.executable} -m pip install -r ../requirements.txt\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.core.display import display, HTML\n", - "\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.rcdefaults()\n", - "\n", - "from ptype.Ptype import Ptype, Column2ARFF\n", - "from ptype.utils import evaluate_types\n", - "import pandas as pd\n", - "import numpy as np\n", - "from utils import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Incorrect Column Type Prediction\n", - "\n", - "## 1.a Incorrect Type Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name=\"accident2016\", header=0)\n", - "\n", - "column = \"Time (24hr)\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.reclassify_column(column, 'date')\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "# do the same thing for arff type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.b Uniform posterior distribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name='inspection_outcomes', header=0)\n", - "\n", - "column = 'Provision type'\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.reclassify_column(column, 'string')\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "# do the same thing for arff type\n", - "# add the character to the alphabet?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Incorrect Missing Data Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name='auto')\n", - "column = 0\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.cols[column].reclassify_normal(['-1'])\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
a Incorrect Anomaly Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column = 'Status'\n", - "df = read_data(dataset_name='data_gov_10151_1', header=0)\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 20)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.cols[column].reclassify_normal(['T', 'U'])\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3.b Incorrect Anomaly Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name=\"survey\", header=0)\n", - "\n", - "column = \"Gender\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)\n", - "display(df_subsample)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. 
Multiple Missing Data Encodings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(\"mass_6\", header=0)\n", - "\n", - "column = \"LRE Ages 3-5 - Full Incl #\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=20)\n", - "display(df_subsample)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "new_encoding = 'NA'\n", - "ptype.replace_missing(column, new_encoding)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in ptype.model.data[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
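Taken together, the notebook changes in PATCH 2/4 and PATCH 3/4 reduce the column-typing workflow to three calls: construct Ptype with a model folder, fit a schema, and transform the data frame with it. The sketch below is a minimal, illustrative walk-through of that API, assuming the ptype package from this branch is importable and the trained models sit under ../models/ as in the demo notebooks; the CSV path is a placeholder, not a file from the repository.

import pandas as pd

from ptype.Ptype import Ptype

# Placeholder input; any CSV with mixed types and ad-hoc missing-data encodings will do.
df = pd.read_csv("some_messy_table.csv", dtype=str)

ptype = Ptype(model_folder="../models/")

# Runs inference and collects the per-column results into a schema dict.
schema = ptype.fit_schema(df)

# Replaces the detected missing-data encodings with pd.NA and updates the dtypes.
df_clean = ptype.transform_schema(df, schema)

print(df_clean.dtypes)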
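For reference, fit_schema returns an ordinary dict keyed by column name, with one entry per column carrying the fields assembled in Ptype.fit_schema. The record below shows the shape of one such entry; the values are invented for illustration and are not real ptype output.

# One schema entry as built by fit_schema (values are made up):
{
    "type": "integer",             # ptype column type
    "dtype": "Int64",              # pandas dtype recorded for the column
    "arff_type": "numeric",        # ARFF type predicted by Column2ARFF
    "missing_values": ["-1", "NA"],
    "missingness_ratio": 0.05,
    "anomalies": ["?"],
    "anomalous_ratio": 0.02,
    # "categorical_values": [...]  # added only when arff_type == "nominal"
}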
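transform_schema leans on a standard pandas idiom: map every value listed under missing_values to pd.NA, then move the column to a nullable dtype so the NAs are preserved rather than coerced to float NaN. The self-contained snippet below demonstrates that idiom on a toy series, independently of ptype; the -1 encoding and the Int64 target dtype are chosen purely for illustration.

import pandas as pd

# Toy column in which -1 encodes missing data.
s = pd.Series([1, 2, -1, 3, -1])
missing_values = [-1]

# Same idea as Ptype.as_normal: replace missing-data encodings with pd.NA.
s = s.map(lambda v: pd.NA if v in missing_values else v)

# Nullable integer dtype keeps the NAs (the dtype the schema records for integer columns).
s = s.astype("Int64")

print(s.isna().sum())  # 2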
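demo_column.ipynb now also calls pandas' own build_table_schema on the raw frame, which gives a useful point of comparison: pandas reports the dtypes as stored, whereas fit_schema reports inferred types together with missing-data encodings and anomalies. A small sketch of the pandas side on an invented two-column frame (the exact output dict varies slightly across pandas versions):

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"Status": ["A", "C", "N"], "Time (24hr)": ["0915", "1830", "2359"]})
print(build_table_schema(df))
# Roughly: {'fields': [{'name': 'index', 'type': 'integer'},
#                      {'name': 'Status', 'type': 'string'},
#                      {'name': 'Time (24hr)', 'type': 'string'}],
#           'primaryKey': ['index'], 'pandas_version': '0.20.0'}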