From e04f12071beed50e2c0af353469d3dfc40055ce2 Mon Sep 17 00:00:00 2001 From: Taha Yusuf Ceritli Date: Thu, 10 Sep 2020 23:04:58 +0300 Subject: [PATCH 1/4] added pretty print --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f91976d..f215a87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ matplotlib nbval numpy pandas +pprint scipy scikit-learn seaborn From 4230b88ed222d449d605c9f76fe54bc1a168172d Mon Sep 17 00:00:00 2001 From: Taha Yusuf Ceritli Date: Thu, 10 Sep 2020 23:05:41 +0300 Subject: [PATCH 2/4] added schema functions --- notebooks/demo_anomaly.ipynb | 32 ++++++++----- notebooks/demo_column.ipynb | 33 +++++++++++--- notebooks/demo_missing.ipynb | 16 +++---- ptype/Ptype.py | 88 +++++++++++++++++++++++++++++++++++- 4 files changed, 141 insertions(+), 28 deletions(-) diff --git a/notebooks/demo_anomaly.ipynb b/notebooks/demo_anomaly.ipynb index 2e90197..22d3ba2 100644 --- a/notebooks/demo_anomaly.ipynb +++ b/notebooks/demo_anomaly.ipynb @@ -17,7 +17,7 @@ "# NBVAL_IGNORE_OUTPUT\n", "import sys\n", "sys.path.insert(0, '../')\n", - "!{sys.executable} -m pip install -r ../requirements.txt\n" + "# !{sys.executable} -m pip install -r ../requirements.txt\n" ] }, { @@ -45,9 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" + "ptype = Ptype(model_folder='../models/')" ] }, { @@ -60,7 +58,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "column = 'Status'\n", @@ -69,6 +69,16 @@ "df_subsample" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = read_data(dataset_name='data_gov_10151_1', header=0)\n", + "df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -100,10 +110,10 @@ "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='anomaly')" ] @@ -159,13 +169,13 @@ "outputs": [], "source": [ "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='anomaly')" diff --git a/notebooks/demo_column.ipynb b/notebooks/demo_column.ipynb index c631127..b8a8ff1 100644 --- a/notebooks/demo_column.ipynb +++ b/notebooks/demo_column.ipynb @@ -34,6 +34,7 @@ "plt.rcdefaults()\n", "import numpy as np\n", "import pandas as pd\n", + "import pprint\n", "import seaborn as sns\n", "\n", "from ptype.Ptype import Ptype, Column2ARFF\n", @@ -149,7 +150,7 @@ "# check counts of missing data\n", "missing_data = df.isnull().sum()\n", "missing_data.sort_values(inplace=True, ascending=False)\n", - "missing_data.head()\n", + 
"display(missing_data.head())\n", "\n", "# drop rows\n", "n = df.shape[0]\n", @@ -184,6 +185,17 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas.io.json import build_table_schema\n", + "\n", + "build_table_schema(df)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -206,11 +218,18 @@ "df = df[features+target]\n", "df.head()\n", "\n", - "ptype = Ptype()\n", - "ptype.run_inference(df)\n", + "ptype = Ptype(model_folder='../models/')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "schema = ptype.fit_schema(df)\n", "\n", - "df = ptype.get_final_df()\n", - "df.head()" + "pprint.pprint(schema)" ] }, { @@ -219,7 +238,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.dtypes" + "df = ptype.transform_schema(df, schema)" ] }, { @@ -237,7 +256,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype.show_missing_values()" + "df.dtypes" ] }, { diff --git a/notebooks/demo_missing.ipynb b/notebooks/demo_missing.ipynb index 0d15ab1..8c12afa 100644 --- a/notebooks/demo_missing.ipynb +++ b/notebooks/demo_missing.ipynb @@ -45,9 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" + "ptype = Ptype(model_folder='../models/')" ] }, { @@ -80,10 +78,10 @@ "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='missing')" ] @@ -139,13 +137,13 @@ "outputs": [], "source": [ "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", + "# features = ptype.features[column]\n", + "# arff_type, arff_post = column2ARFF.get_arff(features)\n", "\n", "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", " types=ptype.types.items())\n", "\n", - "plot_arff_type_posterior(arff_post)\n", + "# plot_arff_type_posterior(arff_post)\n", "\n", "plot_row_type_posterior(ptype.cols[column], t='missing')" ] diff --git a/ptype/Ptype.py b/ptype/Ptype.py index 0c4f01d..7807c0d 100644 --- a/ptype/Ptype.py +++ b/ptype/Ptype.py @@ -78,6 +78,15 @@ def show_results(self): print("\tfraction of missing:", round(missing / total, 2), "\n") print("\tfraction of anomalies:", round(anomalies / total, 2), "\n") + def get_ratio(self, status): + indices = [ + i + for i, _ in enumerate(self.unique_vals) + if self.unique_vals_status[i] == status + ] + total = sum(self.unique_vals_counts) + return round(sum(self.unique_vals_counts[indices]) / total, 2) + def get_normal_predictions(self): """Values identified as 'normal'.""" return [ @@ -113,7 +122,7 @@ def replace_missing(self, v): class Ptype: - def __init__(self, _exp_num=0, _types=None): + def __init__(self, _exp_num=0, _types=None, model_folder="models/"): default_types = { 1: "integer", 2: "string", @@ -134,6 +143,7 @@ def __init__(self, _exp_num=0, _types=None): self.features = {} self.verbose = False self.cols = {} # column-indexed + self.column2ARFF = Column2ARFF(model_folder) def set_data(self, df): _dataset_name = "demo" @@ -350,6 
+360,82 @@ def get_final_df(self):
         df_final = self.update_dtypes(df_final)
         return df_final
 
+    def fit_schema(self, df):
+        """Generates a schema for a given data frame.
+
+        This function calculates the ptype outputs for a data frame and
+        stores them in a schema.
+
+        Parameters
+        ----------
+        df: Pandas dataframe object.
+
+
+        Returns
+        -------
+        schema: Schema object.
+        """
+        self.run_inference(df)
+
+        # predicts the corresponding ARFF types
+        for col_name in self.cols:
+            features = self.features[col_name]
+            self.cols[col_name].arff_type = self.column2ARFF.get_arff_type(features)
+
+        ptype_pandas_mapping = {"integer": "Int64"}
+        schema = {}
+        for col_name in df:
+            col = self.cols[col_name]
+            t = col.predicted_type
+            arff_type = col.arff_type
+            normal_values = list(np.unique(col.get_normal_predictions()))
+            missing_values = list(np.unique(col.get_missing_data_predictions()))
+            anomalies = list(np.unique(col.get_anomaly_predictions()))
+            missingness_ratio = col.get_ratio(Status.MISSING)
+            anomalous_ratio = col.get_ratio(Status.ANOMALOUS)
+
+            schema[col_name] = {
+                "type": t,
+                "dtype": ptype_pandas_mapping[t],
+                "arff_type": arff_type,
+                "missing_values": missing_values,
+                "missingness_ratio": missingness_ratio,
+                "anomalies": anomalies,
+                "anomalous_ratio": anomalous_ratio,
+            }
+            if arff_type == "nominal":
+                schema[col_name]["categorical_values"] = normal_values
+        return schema
+
+    def transform_schema(self, df, schema):
+        """Transforms a data frame according to a schema.
+
+        This function replaces the values detected as missing data with pd.NA and updates the column dtypes.
+
+        Parameters
+        ----------
+        df: Pandas dataframe object.
+        schema: Schema object.
+
+        Returns
+        -------
+        df_new: Transformed Pandas dataframe object.
+        """
+        df_new = df.copy()
+
+        # encodes missing data
+        df_new = df_new.apply(self.as_normal(schema), axis=0)
+
+        # changes dtypes
+        df_new = self.update_dtypes(df_new)
+
+        return df_new
+
+    def as_normal(self, schema):
+        return lambda series: series.map(
+            lambda v: pd.NA if v in schema[series.name]["missing_values"] else v
+        )
+
     def detect_missing_anomalies(self, inferred_column_type):
         if inferred_column_type != "all identical":
             row_posteriors = self.model.p_z[:, np.argmax(self.model.p_t), :]

From 2d3aff62fe282bc07da93c6b5c33259645716979 Mon Sep 17 00:00:00 2001
From: Taha Yusuf Ceritli
Date: Thu, 10 Sep 2020 23:27:33 +0300
Subject: [PATCH 3/4] removed pprint

---
 notebooks/demo_column.ipynb | 6 +++---
 requirements.txt            | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/notebooks/demo_column.ipynb b/notebooks/demo_column.ipynb
index b8a8ff1..f61d4fd 100644
--- a/notebooks/demo_column.ipynb
+++ b/notebooks/demo_column.ipynb
@@ -34,7 +34,7 @@
     "plt.rcdefaults()\n",
     "import numpy as np\n",
     "import pandas as pd\n",
-    "import pprint\n",
+    "# import pprint\n",
     "import seaborn as sns\n",
     "\n",
     "from ptype.Ptype import Ptype, Column2ARFF\n",
@@ -228,8 +228,8 @@
    "outputs": [],
    "source": [
     "schema = ptype.fit_schema(df)\n",
-    "\n",
-    "pprint.pprint(schema)"
+    "schema\n",
+    "# pprint.pprint(schema)"
    ]
   },
diff --git a/requirements.txt b/requirements.txt
index f215a87..f91976d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,6 @@ matplotlib
 nbval
 numpy
 pandas
-pprint
 scipy
 scikit-learn
 seaborn

From 468e6e1a962b525b9412e48ffd2f5912423c6a83 Mon Sep 17 00:00:00 2001
From: Taha Yusuf Ceritli
Date: Thu, 10 Sep 2020 23:38:03 +0300
Subject: [PATCH 4/4] deleted old notebook

---
 notebooks/demo.ipynb | 442 ------------------------------------------
 1 file changed, 442 deletions(-)
 delete mode 100644 notebooks/demo.ipynb

diff --git 
a/notebooks/demo.ipynb b/notebooks/demo.ipynb deleted file mode 100644 index 8ca6ca2..0000000 --- a/notebooks/demo.ipynb +++ /dev/null @@ -1,442 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we present various usecases to interact with ptype to handle:\n", - "\n", - "- incorrect column type predictions,\n", - "- incorrect missing type predictions.\n", - "- incorrect anomaly type predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Preamble to run notebook in context of source package.\n", - "# NBVAL_IGNORE_OUTPUT\n", - "import sys\n", - "sys.path.insert(0, '../')\n", - "!{sys.executable} -m pip install -r ../requirements.txt\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.core.display import display, HTML\n", - "\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.rcdefaults()\n", - "\n", - "from ptype.Ptype import Ptype, Column2ARFF\n", - "from ptype.utils import evaluate_types\n", - "import pandas as pd\n", - "import numpy as np\n", - "from utils import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype = Ptype()\n", - "\n", - "column2ARFF = Column2ARFF(\"../models/\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Incorrect Column Type Prediction\n", - "\n", - "## 1.a Incorrect Type Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name=\"accident2016\", header=0)\n", - "\n", - "column = \"Time (24hr)\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.reclassify_column(column, 'date')\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "# do the same thing for arff type" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.b Uniform posterior distribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name='inspection_outcomes', header=0)\n", - "\n", - "column = 'Provision type'\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.reclassify_column(column, 'string')\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "# do the same thing for arff type\n", - "# add the character to the alphabet?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Incorrect Missing Data Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name='auto')\n", - "column = 0\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 10)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.cols[column].reclassify_normal(['-1'])\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
a Incorrect Anomaly Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column = 'Status'\n", - "df = read_data(dataset_name='data_gov_10151_1', header=0)\n", - "df_subsample = subsample_df(df, column_to_sample_from = column, sample_num = 20)\n", - "df_subsample" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.cols[column].reclassify_normal(['T', 'U'])\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3.b Incorrect Anomaly Prediction" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(dataset_name=\"survey\", header=0)\n", - "\n", - "column = \"Gender\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=10)\n", - "display(df_subsample)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='anomaly')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. 
Multiple Missing Data Encodings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = read_data(\"mass_6\", header=0)\n", - "\n", - "column = \"LRE Ages 3-5 - Full Incl #\"\n", - "df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=20)\n", - "display(df_subsample)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in df_subsample[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_bar(\n", - " unique_values,\n", - " counts,\n", - " title=\"counts of the unique data values\",\n", - " y_lim_max=None,\n", - " xlabel=\"Unique Value\",\n", - " ylabel=\"Counts\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptype.run_inference(_data_frame=df_subsample)\n", - "features = ptype.features[column]\n", - "arff_type, arff_post = column2ARFF.get_arff(features)\n", - "\n", - "plot_column_type_posterior(p_t=ptype.all_posteriors[\"demo\"][column], \n", - " types=ptype.types.items())\n", - "\n", - "plot_arff_type_posterior(arff_post)\n", - "\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "new_encoding = 'NA'\n", - "ptype.replace_missing(column, new_encoding)\n", - "\n", - "unique_values, counts = np.unique(\n", - " [str(int_element) for int_element in ptype.model.data[column].tolist()],\n", - " return_counts=True,\n", - ")\n", - "plot_row_type_posterior(ptype.cols[column], t='missing')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
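Taken together, the notebook changes in PATCH 2/4 and PATCH 3/4 reduce the column-typing workflow to three calls: construct Ptype with a model folder, fit a schema, and transform the data frame with it. The sketch below is a minimal, illustrative walk-through of that API, assuming the ptype package from this branch is importable and the trained models sit under ../models/ as in the demo notebooks; the CSV path is a placeholder, not a file from the repository.

import pandas as pd

from ptype.Ptype import Ptype

# Placeholder input; any CSV with mixed types and ad-hoc missing-data encodings will do.
df = pd.read_csv("some_messy_table.csv", dtype=str)

ptype = Ptype(model_folder="../models/")

# Runs inference and collects the per-column results into a schema dict.
schema = ptype.fit_schema(df)

# Replaces the detected missing-data encodings with pd.NA and updates the dtypes.
df_clean = ptype.transform_schema(df, schema)

print(df_clean.dtypes)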
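For reference, fit_schema returns an ordinary dict keyed by column name, with one entry per column carrying the fields assembled in Ptype.fit_schema. The record below shows the shape of one such entry; the values are invented for illustration and are not real ptype output.

# One schema entry as built by fit_schema (values are made up):
{
    "type": "integer",             # ptype column type
    "dtype": "Int64",              # pandas dtype recorded for the column
    "arff_type": "numeric",        # ARFF type predicted by Column2ARFF
    "missing_values": ["-1", "NA"],
    "missingness_ratio": 0.05,
    "anomalies": ["?"],
    "anomalous_ratio": 0.02,
    # "categorical_values": [...]  # added only when arff_type == "nominal"
}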
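transform_schema leans on a standard pandas idiom: map every value listed under missing_values to pd.NA, then move the column to a nullable dtype so the NAs are preserved rather than coerced to float NaN. The self-contained snippet below demonstrates that idiom on a toy series, independently of ptype; the -1 encoding and the Int64 target dtype are chosen purely for illustration.

import pandas as pd

# Toy column in which -1 encodes missing data.
s = pd.Series([1, 2, -1, 3, -1])
missing_values = [-1]

# Same idea as Ptype.as_normal: replace missing-data encodings with pd.NA.
s = s.map(lambda v: pd.NA if v in missing_values else v)

# Nullable integer dtype keeps the NAs (the dtype the schema records for integer columns).
s = s.astype("Int64")

print(s.isna().sum())  # 2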
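demo_column.ipynb now also calls pandas' own build_table_schema on the raw frame, which gives a useful point of comparison: pandas reports the dtypes as stored, whereas fit_schema reports inferred types together with missing-data encodings and anomalies. A small sketch of the pandas side on an invented two-column frame (the exact output dict varies slightly across pandas versions):

import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({"Status": ["A", "C", "N"], "Time (24hr)": ["0915", "1830", "2359"]})
print(build_table_schema(df))
# Roughly: {'fields': [{'name': 'index', 'type': 'integer'},
#                      {'name': 'Status', 'type': 'string'},
#                      {'name': 'Time (24hr)', 'type': 'string'}],
#           'primaryKey': ['index'], 'pandas_version': '0.20.0'}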