From 0da3e9bbcfbcd298bc7b60f4196ca2ef7625a6b5 Mon Sep 17 00:00:00 2001
From: jnkien
Date: Wed, 7 Jul 2021 11:32:23 +0200
Subject: [PATCH 1/3] Get optimal featuring insights with a simple
 LogisticModel

---
 notebooks/05-how_to_data_featuring.ipynb |  90 ++++++++++++++++
 predictsignauxfaibles/preprocessors.py   | 124 ++++++++++++++++++++++-
 2 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/05-how_to_data_featuring.ipynb

diff --git a/notebooks/05-how_to_data_featuring.ipynb b/notebooks/05-how_to_data_featuring.ipynb
new file mode 100644
index 0000000..086c633
--- /dev/null
+++ b/notebooks/05-how_to_data_featuring.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "legislative-bridal",
+   "metadata": {},
+   "source": [
+    "# How to test various featurings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dutch-heavy",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from predictsignauxfaibles.preprocessors import get_featuring, apply_log, apply_sqrt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "worst-cambodia",
+   "metadata": {},
+   "source": [
+    "# 1. List of features\n",
+    "You can either build the list of variable names by hand, or download the file variables.json with the following command (fill in the proxy `port`):\n",
+    "\n",
+    "```\n",
+    "curl --proxy socks5h://localhost: -L https://raw.githubusercontent.com/signaux-faibles/opensignauxfaibles/master/js/reduce.algo2/docs/variables.json -o output/data/variables.json\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "altered-bicycle",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"output/data/variables.json\", 'r', encoding = 'utf-8') as f:\n",
+    "    variables = json.load(f)\n",
+    "\n",
+    "features = list(set([x['name'] for x in variables]) - set([\"outcome\"]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "looking-reply",
+   "metadata": {},
+   "source": [
+    "# 2. Exploration of relevant featuring\n",
+    "For each variable taken separately, the function `get_featuring` provides the transformation that best explains the variable `outcome` in a simple logistic regression. These transformations still need to be tested in the SF model afterwards."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "compact-jurisdiction", + "metadata": {}, + "outputs": [], + "source": [ + "res = get_featuring(features, [apply_log, apply_sqrt])\n", + "res" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/predictsignauxfaibles/preprocessors.py b/predictsignauxfaibles/preprocessors.py index 014cf52..b3ddb58 100644 --- a/predictsignauxfaibles/preprocessors.py +++ b/predictsignauxfaibles/preprocessors.py @@ -1,7 +1,10 @@ from collections import namedtuple - +from typing import List, Callable +import math import numpy as np import pandas as pd +from sklearn.linear_model import LogisticRegression +from predictsignauxfaibles.data import SFDataset Preprocessor = namedtuple("Preprocessor", ["name", "function", "input", "output"]) @@ -58,3 +61,122 @@ def acoss_make_avg_delta_dette_par_effectif(data: pd.DataFrame): columns_to_drop = ["dette_par_effectif", "dette_par_effectif_past_3"] data.drop(columns=columns_to_drop, axis=1, inplace=True) return data + + +def apply_log(number: float) -> float: + """ + Apply the transformation number -> log(number + 1) + """ + return math.log(number + 1) + + +def apply_sqrt(number: float) -> float: + """ + Apply the transformation number -> sqrt(number) + """ + return math.sqrt(number) + + +def get_featuring( + features: List[str], funcs: List[Callable[[float], float]] +) -> List[dict]: + """ + For each feature in 'features' taken separately, provide the transformation among + 'funcs' to apply to improve the prediction performance with a LogisticRegression + (outcome = feature). For significant results, this should be bootstrap but it + is enough to test the transformations in the model used in the project in a + second step. + """ + dataset = SFDataset( + date_min="2015-01-01", + date_max="2020-06-30", + fields=["outcome"] + features, + sample_size=1000, + ) + dataset.fetch_data() + + res = [ + get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs + ] + + res_as_df = pd.DataFrame(res) + res_as_df = res_as_df[res_as_df["is_relevant"]] + + return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( + ["feature"] + ) + + +def get_featuring_unitary( + data: pd.DataFrame, feat: str, func: Callable[[float], float] +) -> dict: + """ + Apply the transformation 'func' to the feature 'feat', build a LogisticRegression + (outcome = feature) with and without the transformation and determine if it was + relevant. 
+ """ + data = data[["outcome", feat]].copy() + + # handle missing value + data.dropna(inplace=True) + + if len(data[feat]) == 0: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle non-numeric + if not all([type(x) in [int, float] for x in data[feat]]): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle negative values + if any(data[feat] < 0): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + response = data.outcome + + # handle singular class in the response + if len(response.unique()) == 1: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + feat_values = np.array(data[feat]).reshape(-1, 1) + + # Logistic without featuring + model = LogisticRegression() + model.fit(feat_values, response) + score_before = model.score(feat_values, response) + + # Logistic with featuring + feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) + model = LogisticRegression() + model.fit(feat_values, response) + score_after = model.score(feat_values, response) + + return { + "feature": feat, + "func": func.__name__, + "score_before": score_before, + "score_after": score_after, + "is_relevant": score_after > score_before, + } From 82c0d06e2c73004d7c9d423cefe3f758d7f0ea76 Mon Sep 17 00:00:00 2001 From: jnkien Date: Wed, 7 Jul 2021 15:56:40 +0200 Subject: [PATCH 2/3] Add transformers for the featuring and test a model with enhanced featuring --- models/default_with_featuring/README.md | 4 + models/default_with_featuring/model_conf.py | 204 ++++++++++++++++++++ notebooks/05-how_to_data_featuring.ipynb | 166 +++++++++++++++- predictsignauxfaibles/preprocessors.py | 125 +----------- predictsignauxfaibles/transformers.py | 167 ++++++++++++++++ 5 files changed, 535 insertions(+), 131 deletions(-) create mode 100644 models/default_with_featuring/README.md create mode 100644 models/default_with_featuring/model_conf.py create mode 100644 predictsignauxfaibles/transformers.py diff --git a/models/default_with_featuring/README.md b/models/default_with_featuring/README.md new file mode 100644 index 0000000..6238649 --- /dev/null +++ b/models/default_with_featuring/README.md @@ -0,0 +1,4 @@ +# Modèle "default_with_featuring" + +Ce modèle est identique au modèle "default" mais avec des préprocessing de variables personnalisés + diff --git a/models/default_with_featuring/model_conf.py b/models/default_with_featuring/model_conf.py new file mode 100644 index 0000000..b7febb5 --- /dev/null +++ b/models/default_with_featuring/model_conf.py @@ -0,0 +1,204 @@ +from datetime import datetime +import logging +import os +import subprocess + +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn_pandas import DataFrameMapper + +from predictsignauxfaibles.data import SFDataset, OversampledSFDataset +from predictsignauxfaibles.pipelines import DEFAULT_PIPELINE +from predictsignauxfaibles.utils import check_feature +from predictsignauxfaibles.transformers import SqrtTransformer,LogTransformer + +# ENV (default is "develop", can be set to "prod") +ENV = os.getenv("ENV", "develop") + + +# Model Information +MODEL_ID = "202103_logreg_full_with_featuring" 
+MODEL_RUN_DATE = datetime.today() +MODEL_GIT_SHA = str( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]), encoding="utf-8" +).rstrip("\n") + +# Variables disponibles en base : +# https://github.com/signaux-faibles/opensignauxfaibles/master/js/reduce.algo2/docs/variables.json +VARIABLES = [ + "financier_court_terme", + "interets", + "ca", + "equilibre_financier", + "endettement", + "degre_immo_corporelle", + "liquidite_reduite", + "poids_bfr_exploitation", + "productivite_capital_investi", + "rentabilite_economique", + "rentabilite_nette", + "cotisation", + "cotisation_moy12m", + "montant_part_ouvriere", + "montant_part_ouvriere_past_1", + "montant_part_ouvriere_past_12", + "montant_part_ouvriere_past_2", + "montant_part_ouvriere_past_3", + "montant_part_ouvriere_past_6", + "montant_part_patronale", + "montant_part_patronale_past_1", + "montant_part_patronale_past_12", + "montant_part_patronale_past_2", + "montant_part_patronale_past_3", + "montant_part_patronale_past_6", + "ratio_dette", + "ratio_dette_moy12m", + "effectif", + "apart_heures_consommees_cumulees", + "apart_heures_consommees", + "paydex_nb_jours", + "paydex_nb_jours_past_12", +] + +# ces variables sont toujours requêtées +VARIABLES += ["outcome", "periode", "siret", "siren", "time_til_outcome", "code_naf"] + +# Model-specific préprocessing +TRANSFO_PIPELINE = DEFAULT_PIPELINE + +# features +FEATURE_GROUPS = { + "sante_financiere": [ + "financier_court_terme", + "interets", + "ca", + "equilibre_financier", + "endettement", + "degre_immo_corporelle", + "liquidite_reduite", + "poids_bfr_exploitation", + "productivite_capital_investi", + "rentabilite_economique", + "rentabilite_nette", + ], + "activite_partielle": [ + "apart_heures_consommees_cumulees", + "apart_heures_consommees", + ], + "retards_paiement": [ + "paydex_group", + "paydex_yoy", + ], + "dette_urssaf": [ + "ratio_dette", + "avg_delta_dette_par_effectif", + ], + "miscellaneous": [], +} + +FEATURES = [feat for group_feats in FEATURE_GROUPS.values() for feat in group_feats] + +for feature in FEATURES: + if not check_feature(feature, VARIABLES, TRANSFO_PIPELINE): + raise ValueError( + f"Feature '{feature}' is not in VARIABLES nor created by the PIPELINE" + ) + +# model +TO_ONEHOT_ENCODE = ["paydex_group"] +# /!\ Onehot variables must be listed in the same order as in features, for explain function +TO_ONEHOT_ENCODE = [ + to_oh_enc for to_oh_enc in FEATURES if to_oh_enc in TO_ONEHOT_ENCODE +] + +TO_LOG = [ + 'time_til_default', + 'montant_part_patronale', + 'couverture_ca_besoin_fdr_past_2', + 'montant_part_ouvriere', + 'montant_part_patronale_past_1', + 'montant_part_patronale_past_6', + 'montant_part_patronale_past_12', + 'montant_part_ouvriere_past_3', + 'montant_part_ouvriere_past_12', + 'debit_entreprise', + 'apart_heures_autorisees' +] +TO_LOG = [x for x in TO_LOG if x in VARIABLES] + +TO_SQRT = [ + 'delai_montant_echeancier', + 'montant_part_patronale_past_2', + 'montant_part_patronale_past_3', + 'montant_part_ouvriere_past_1', + 'montant_part_ouvriere_past_6', + 'part_salaries', + 'part_salaries_past_1', + 'apart_heures_consommees_cumulees', + 'apart_entreprise', + 'apart_heures_consommees' +] +TO_SQRT = [x for x in TO_SQRT if x in VARIABLES] + +TO_SCALE = list(set(FEATURES) - set(TO_ONEHOT_ENCODE) - set(TO_LOG) - set(TO_SQRT)) + +mapper = DataFrameMapper( + [ + (TO_ONEHOT_ENCODE, [OneHotEncoder()]), + (TO_SCALE, [StandardScaler()]), + (TO_LOG, [LogTransformer(),StandardScaler()]), + (TO_SQRT, [SqrtTransformer(), StandardScaler()]), + ], +) + 
+MODEL_PIPELINE = Pipeline( + [ + ("transform_dataframe", mapper), + ("fit_model", LogisticRegression()) + ] +) + +# Train Dataset +TRAIN_FROM = "2016-01-01" +TRAIN_TO = "2018-06-30" +TRAIN_SAMPLE_SIZE = 1_000_000 if ENV == "prod" else 5_000 +TRAIN_OVERSAMPLING = 0.2 +TRAIN_DATASET = OversampledSFDataset( + TRAIN_OVERSAMPLING, + date_min=TRAIN_FROM, + date_max=TRAIN_TO, + fields=VARIABLES, + sample_size=TRAIN_SAMPLE_SIZE, +) + +# Test Dataset +TEST_FROM = "2018-07-01" +TEST_TO = "2018-10-31" +TEST_SAMPLE_SIZE = 250_000 if ENV == "prod" else 5_000 +TEST_DATASET = SFDataset( + date_min=TEST_FROM, + date_max=TEST_TO, + fields=VARIABLES, + sample_size=TEST_SAMPLE_SIZE, +) + +# Predict Dataset +PREDICT_ON = "2020-02-01" +PREDICT_SAMPLE_SIZE = 1_000_000_000 if ENV == "prod" else 5_000 +PREDICT_DATASET = SFDataset( + date_min=PREDICT_ON, + date_max=PREDICT_ON[:-2] + "28", + fields=VARIABLES, + sample_size=PREDICT_SAMPLE_SIZE, +) + +# Evaluation parameters +EVAL_BETA = 2 + +if __name__ == "__main__": + logging.getLogger().setLevel("INFO") + logging.info(f"ENV : {ENV}") + logging.info(f"Model {MODEL_ID}") + logging.info(f"Run on {MODEL_RUN_DATE}") + logging.info(f"Current commit: {MODEL_GIT_SHA}") diff --git a/notebooks/05-how_to_data_featuring.ipynb b/notebooks/05-how_to_data_featuring.ipynb index 086c633..0efa71b 100644 --- a/notebooks/05-how_to_data_featuring.ipynb +++ b/notebooks/05-how_to_data_featuring.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "legislative-bridal", + "id": "junior-booking", "metadata": {}, "source": [ "# How to test various featurings" @@ -11,17 +11,17 @@ { "cell_type": "code", "execution_count": null, - "id": "dutch-heavy", + "id": "fitting-greek", "metadata": {}, "outputs": [], "source": [ "import json\n", - "from predictsignauxfaibles.preprocessors import get_featuring, apply_log, apply_sqrt" + "from predictsignauxfaibles.transformers import print_featuring_for_model_conf, get_featuring, apply_log, apply_sqrt" ] }, { "cell_type": "markdown", - "id": "worst-cambodia", + "id": "studied-probe", "metadata": {}, "source": [ "# 1. List of features\n", @@ -35,7 +35,7 @@ { "cell_type": "code", "execution_count": null, - "id": "altered-bicycle", + "id": "parental-offset", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "markdown", - "id": "looking-reply", + "id": "individual-concert", "metadata": {}, "source": [ "# 2. Exploration of relevant featuring\n", @@ -57,13 +57,165 @@ { "cell_type": "code", "execution_count": null, - "id": "compact-jurisdiction", + "id": "medium-design", "metadata": {}, "outputs": [], "source": [ "res = get_featuring(features, [apply_log, apply_sqrt])\n", "res" ] + }, + { + "cell_type": "markdown", + "id": "variable-relative", + "metadata": {}, + "source": [ + "# 3. Print the tranformation for each feature if relevant ready to be plugged in a model_conf.py." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fuzzy-briefs", + "metadata": {}, + "outputs": [], + "source": [ + "print_featuring_for_model_conf(res)" + ] + }, + { + "cell_type": "markdown", + "id": "worst-liquid", + "metadata": {}, + "source": [ + "# 4. 
(optional) Build/Export train and test datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ready-memory", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.getLogger().setLevel(logging.INFO)\n", + "from predictsignauxfaibles.utils import load_conf\n", + "\n", + "conf = load_conf(\"default\")\n", + "\n", + "train = conf.TRAIN_DATASET\n", + "train.sample_size = 1e4\n", + "\n", + "test = conf.TEST_DATASET\n", + "test.sample_size = 1e4\n", + "\n", + "savepath = \"output/data/featuring\"\n", + "\n", + "train.fetch_data().raise_if_empty()\n", + "test.fetch_data().raise_if_empty()\n", + "logging.info(\"Succesfully loaded Features data from MongoDB\")\n", + "\n", + "if savepath is not None:\n", + " train.data.to_csv(f\"{savepath}_train.csv\")\n", + " test.data.to_csv(f\"{savepath}_test.csv\")\n", + " logging.info(f\"Saved Features extract to {savepath}\")" + ] + }, + { + "cell_type": "markdown", + "id": "formal-architect", + "metadata": {}, + "source": [ + "# 5. Get data from csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "grave-treatment", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.getLogger().setLevel(logging.INFO)\n", + "\n", + "import pandas as pd\n", + "\n", + "from predictsignauxfaibles.config import IGNORE_NA\n", + "from predictsignauxfaibles.pipelines import run_pipeline\n", + "from predictsignauxfaibles.utils import load_conf\n", + "from predictsignauxfaibles.evaluate import evaluate\n", + "\n", + "csvpath = \"output/data/featuring\"\n", + "\n", + "train_filepath = f\"{csvpath}_train.csv\"\n", + "test_filepath = f\"{csvpath}_test.csv\"\n", + "\n", + "train_data = pd.read_csv(train_filepath)\n", + "logging.info(f\"Succesfully loaded train data from {train_filepath}\")\n", + "\n", + "test_data = pd.read_csv(test_filepath)\n", + "logging.info(f\"Succesfully loaded test data from {test_filepath}\")" + ] + }, + { + "cell_type": "markdown", + "id": "modern-printing", + "metadata": {}, + "source": [ + "# 6. 
Evaluate a model with and without featuring and compare performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "insured-expert", + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_to_compare(train_data, test_data, conf_name:str = \"default\"):\n", + " conf = load_conf(conf_name)\n", + " train = conf.TRAIN_DATASET\n", + " train.sample_size = 1e4\n", + " \n", + " test = conf.TEST_DATASET\n", + " test.sample_size = 1e4\n", + " \n", + " train.data = train_data\n", + " test.data = test_data\n", + " \n", + " train_siren_set = train.data[\"siren\"].unique().tolist()\n", + " test.remove_siren(train_siren_set)\n", + " \n", + " train.replace_missing_data().remove_na(ignore=IGNORE_NA)\n", + " train.data = run_pipeline(train.data, conf.TRANSFO_PIPELINE)\n", + " \n", + " test.replace_missing_data().remove_na(ignore=IGNORE_NA)\n", + " test.data = run_pipeline(test.data, conf.TRANSFO_PIPELINE)\n", + " \n", + " model_pp = conf.MODEL_PIPELINE\n", + " fit = model_pp.fit(train.data, train.data[\"outcome\"])\n", + " \n", + " eval_metrics = evaluate(fit, test, conf.EVAL_BETA)\n", + " return {\n", + " 'conf_name': conf_name,\n", + " 'aucpr': eval_metrics['aucpr'] \n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ordered-dallas", + "metadata": {}, + "outputs": [], + "source": [ + "perf_default = evaluate_to_compare(train_data, test_data, \"default\")\n", + "perf_default_with_featuring = evaluate_to_compare(train_data, test_data, \"default_with_featuring\")\n", + "print(perf_default)\n", + "print(perf_default_with_featuring)" + ] } ], "metadata": { diff --git a/predictsignauxfaibles/preprocessors.py b/predictsignauxfaibles/preprocessors.py index b3ddb58..44b7519 100644 --- a/predictsignauxfaibles/preprocessors.py +++ b/predictsignauxfaibles/preprocessors.py @@ -1,10 +1,6 @@ from collections import namedtuple -from typing import List, Callable -import math -import numpy as np import pandas as pd -from sklearn.linear_model import LogisticRegression -from predictsignauxfaibles.data import SFDataset +import numpy as np Preprocessor = namedtuple("Preprocessor", ["name", "function", "input", "output"]) @@ -61,122 +57,3 @@ def acoss_make_avg_delta_dette_par_effectif(data: pd.DataFrame): columns_to_drop = ["dette_par_effectif", "dette_par_effectif_past_3"] data.drop(columns=columns_to_drop, axis=1, inplace=True) return data - - -def apply_log(number: float) -> float: - """ - Apply the transformation number -> log(number + 1) - """ - return math.log(number + 1) - - -def apply_sqrt(number: float) -> float: - """ - Apply the transformation number -> sqrt(number) - """ - return math.sqrt(number) - - -def get_featuring( - features: List[str], funcs: List[Callable[[float], float]] -) -> List[dict]: - """ - For each feature in 'features' taken separately, provide the transformation among - 'funcs' to apply to improve the prediction performance with a LogisticRegression - (outcome = feature). For significant results, this should be bootstrap but it - is enough to test the transformations in the model used in the project in a - second step. 
- """ - dataset = SFDataset( - date_min="2015-01-01", - date_max="2020-06-30", - fields=["outcome"] + features, - sample_size=1000, - ) - dataset.fetch_data() - - res = [ - get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs - ] - - res_as_df = pd.DataFrame(res) - res_as_df = res_as_df[res_as_df["is_relevant"]] - - return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( - ["feature"] - ) - - -def get_featuring_unitary( - data: pd.DataFrame, feat: str, func: Callable[[float], float] -) -> dict: - """ - Apply the transformation 'func' to the feature 'feat', build a LogisticRegression - (outcome = feature) with and without the transformation and determine if it was - relevant. - """ - data = data[["outcome", feat]].copy() - - # handle missing value - data.dropna(inplace=True) - - if len(data[feat]) == 0: - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - # handle non-numeric - if not all([type(x) in [int, float] for x in data[feat]]): - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - # handle negative values - if any(data[feat] < 0): - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - response = data.outcome - - # handle singular class in the response - if len(response.unique()) == 1: - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - feat_values = np.array(data[feat]).reshape(-1, 1) - - # Logistic without featuring - model = LogisticRegression() - model.fit(feat_values, response) - score_before = model.score(feat_values, response) - - # Logistic with featuring - feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) - model = LogisticRegression() - model.fit(feat_values, response) - score_after = model.score(feat_values, response) - - return { - "feature": feat, - "func": func.__name__, - "score_before": score_before, - "score_after": score_after, - "is_relevant": score_after > score_before, - } diff --git a/predictsignauxfaibles/transformers.py b/predictsignauxfaibles/transformers.py new file mode 100644 index 0000000..0de599f --- /dev/null +++ b/predictsignauxfaibles/transformers.py @@ -0,0 +1,167 @@ +import logging +import math +import re +from typing import List, Callable +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.linear_model import LogisticRegression +from predictsignauxfaibles.data import SFDataset + + +class SqrtTransformer(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + + def fit(self, X, y = None): + return self + + + def transform(self, X, y = None): + X_ = X.copy() + X_ = np.sqrt(X_) + return X_ + + +class LogTransformer(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + + def fit(self, X, y = None): + return self + + + def transform(self, X, y = None): + X_ = X.copy() + X_ = np.log(X_+1) + return X_ + + +def apply_log(number: float) -> float: + """ + Apply the transformation number -> log(number + 1) + """ + return math.log(number + 1) + + +def apply_sqrt(number: float) -> float: + """ + Apply the transformation number -> sqrt(number) + """ + return math.sqrt(number) + + +def print_featuring_for_model_conf(featuring: pd.DataFrame) -> str: + """ + Print the tranformation for 
each feature to be plugged in a model_conf.py. + """ + featuring = featuring[['func', 'feature']].groupby('func').agg(list).reset_index() + featuring = {func:feat for (feat, func) in zip(featuring['feature'], featuring['func'])} + return featuring + + +def get_featuring( + features: List[str], funcs: List[Callable[[float], float]] +) -> pd.DataFrame: + """ + For each feature in 'features' taken separately, provide the transformation among + 'funcs' to apply to improve the prediction performance with a LogisticRegression + (outcome = feature). For significant results, this should be bootstrap but it + is enough to test the transformations in the model used in the project in a + second step. + """ + dataset = SFDataset( + date_min="2015-01-01", + date_max="2020-06-30", + fields=["outcome"] + features, + sample_size=1000, + ) + dataset.fetch_data() + + res = [ + get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs + ] + + res_as_df = pd.DataFrame(res) + res_as_df = res_as_df[res_as_df["is_relevant"]] + + return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( + ["feature"] + ) + + +def get_featuring_unitary( + data: pd.DataFrame, feat: str, func: Callable[[float], float] +) -> dict: + """ + Apply the transformation 'func' to the feature 'feat', build a LogisticRegression + (outcome = feature) with and without the transformation and determine if it was + relevant. + """ + data = data[["outcome", feat]].copy() + + # handle missing value + data.dropna(inplace=True) + + if len(data[feat]) == 0: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle non-numeric + if not all([type(x) in [int, float] for x in data[feat]]): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle negative values + if any(data[feat] < 0): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + response = data.outcome + + # handle singular class in the response + if len(response.unique()) == 1: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + feat_values = np.array(data[feat]).reshape(-1, 1) + + # Logistic without featuring + model = LogisticRegression() + model.fit(feat_values, response) + score_before = model.score(feat_values, response) + + # Logistic with featuring + feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) + model = LogisticRegression() + model.fit(feat_values, response) + score_after = model.score(feat_values, response) + + return { + "feature": feat, + "func": func.__name__, + "score_before": score_before, + "score_after": score_after, + "is_relevant": score_after > score_before, + } From ec7cc8889eb28b2fffac901e7b1757d378fc7ea5 Mon Sep 17 00:00:00 2001 From: jnkien Date: Wed, 7 Jul 2021 16:10:19 +0200 Subject: [PATCH 3/3] add pylint disablings + hook refactoring --- models/default_with_featuring/model_conf.py | 51 ++++++++++----------- predictsignauxfaibles/transformers.py | 36 +++++++-------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/models/default_with_featuring/model_conf.py b/models/default_with_featuring/model_conf.py index b7febb5..49be57a 100644 --- a/models/default_with_featuring/model_conf.py +++ b/models/default_with_featuring/model_conf.py @@ -11,7 +11,7 @@ 
from predictsignauxfaibles.data import SFDataset, OversampledSFDataset from predictsignauxfaibles.pipelines import DEFAULT_PIPELINE from predictsignauxfaibles.utils import check_feature -from predictsignauxfaibles.transformers import SqrtTransformer,LogTransformer +from predictsignauxfaibles.transformers import SqrtTransformer, LogTransformer # ENV (default is "develop", can be set to "prod") ENV = os.getenv("ENV", "develop") @@ -113,31 +113,31 @@ ] TO_LOG = [ - 'time_til_default', - 'montant_part_patronale', - 'couverture_ca_besoin_fdr_past_2', - 'montant_part_ouvriere', - 'montant_part_patronale_past_1', - 'montant_part_patronale_past_6', - 'montant_part_patronale_past_12', - 'montant_part_ouvriere_past_3', - 'montant_part_ouvriere_past_12', - 'debit_entreprise', - 'apart_heures_autorisees' + "time_til_default", + "montant_part_patronale", + "couverture_ca_besoin_fdr_past_2", + "montant_part_ouvriere", + "montant_part_patronale_past_1", + "montant_part_patronale_past_6", + "montant_part_patronale_past_12", + "montant_part_ouvriere_past_3", + "montant_part_ouvriere_past_12", + "debit_entreprise", + "apart_heures_autorisees", ] TO_LOG = [x for x in TO_LOG if x in VARIABLES] TO_SQRT = [ - 'delai_montant_echeancier', - 'montant_part_patronale_past_2', - 'montant_part_patronale_past_3', - 'montant_part_ouvriere_past_1', - 'montant_part_ouvriere_past_6', - 'part_salaries', - 'part_salaries_past_1', - 'apart_heures_consommees_cumulees', - 'apart_entreprise', - 'apart_heures_consommees' + "delai_montant_echeancier", + "montant_part_patronale_past_2", + "montant_part_patronale_past_3", + "montant_part_ouvriere_past_1", + "montant_part_ouvriere_past_6", + "part_salaries", + "part_salaries_past_1", + "apart_heures_consommees_cumulees", + "apart_entreprise", + "apart_heures_consommees", ] TO_SQRT = [x for x in TO_SQRT if x in VARIABLES] @@ -147,16 +147,13 @@ [ (TO_ONEHOT_ENCODE, [OneHotEncoder()]), (TO_SCALE, [StandardScaler()]), - (TO_LOG, [LogTransformer(),StandardScaler()]), + (TO_LOG, [LogTransformer(), StandardScaler()]), (TO_SQRT, [SqrtTransformer(), StandardScaler()]), ], ) MODEL_PIPELINE = Pipeline( - [ - ("transform_dataframe", mapper), - ("fit_model", LogisticRegression()) - ] + [("transform_dataframe", mapper), ("fit_model", LogisticRegression())] ) # Train Dataset diff --git a/predictsignauxfaibles/transformers.py b/predictsignauxfaibles/transformers.py index 0de599f..ac89cd3 100644 --- a/predictsignauxfaibles/transformers.py +++ b/predictsignauxfaibles/transformers.py @@ -1,6 +1,4 @@ -import logging import math -import re from typing import List, Callable import numpy as np import pandas as pd @@ -10,35 +8,33 @@ class SqrtTransformer(BaseEstimator, TransformerMixin): + # pylint: disable = E, C def __init__(self): pass - - - def fit(self, X, y = None): + + def fit(self): return self - - - def transform(self, X, y = None): + + def transform(X): X_ = X.copy() X_ = np.sqrt(X_) return X_ class LogTransformer(BaseEstimator, TransformerMixin): + # pylint: disable=E, C def __init__(self): pass - - - def fit(self, X, y = None): + + def fit(self): return self - - - def transform(self, X, y = None): + + def transform(X): X_ = X.copy() - X_ = np.log(X_+1) + X_ = np.log(X_ + 1) return X_ - + def apply_log(number: float) -> float: """ Apply the transformation number -> log(number + 1) @@ -52,13 +48,15 @@ def apply_sqrt(number: float) -> float: """ return math.sqrt(number) - + def print_featuring_for_model_conf(featuring: pd.DataFrame) -> str: """ Print the tranformation for each feature to be plugged 
in a model_conf.py.
     """
-    featuring = featuring[['func', 'feature']].groupby('func').agg(list).reset_index()
-    featuring = {func:feat for (feat, func) in zip(featuring['feature'], featuring['func'])}
+    featuring = featuring[["func", "feature"]].groupby("func").agg(list).reset_index()
+    featuring = {
+        func: feat for (feat, func) in zip(featuring["feature"], featuring["func"])
+    }
     return featuring
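Note on the `SqrtTransformer` / `LogTransformer` refactoring in PATCH 3/3: scikit-learn (and `sklearn_pandas.DataFrameMapper`) call transformers as `fit(X, y)` and `transform(X)`, so methods defined as `fit(self)` and `transform(X)` (without `self`) raise a `TypeError` when `MODEL_PIPELINE.fit` runs. Below is a minimal, signature-compatible sketch — not part of the patches above — that keeps the same numpy behaviour and quiets pylint with targeted disables instead of dropping the arguments.

```python
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class LogTransformer(BaseEstimator, TransformerMixin):
    """Element-wise x -> log(x + 1); assumes non-negative inputs."""

    def fit(self, X, y=None):  # pylint: disable=unused-argument,invalid-name
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):  # pylint: disable=invalid-name
        return np.log(np.asarray(X, dtype=float) + 1)


class SqrtTransformer(BaseEstimator, TransformerMixin):
    """Element-wise x -> sqrt(x); assumes non-negative inputs."""

    def fit(self, X, y=None):  # pylint: disable=unused-argument,invalid-name
        return self

    def transform(self, X):  # pylint: disable=invalid-name
        return np.sqrt(np.asarray(X, dtype=float))
```

With these signatures, the `DataFrameMapper` entries in `model_conf.py`, e.g. `(TO_LOG, [LogTransformer(), StandardScaler()])`, can be fitted as written.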