From 0da3e9bbcfbcd298bc7b60f4196ca2ef7625a6b5 Mon Sep 17 00:00:00 2001
From: jnkien
Date: Wed, 7 Jul 2021 11:32:23 +0200
Subject: [PATCH 1/3] Get optimal featuring insights with a simple
 LogisticModel

---
 notebooks/05-how_to_data_featuring.ipynb |  90 ++++++++++++++++
 predictsignauxfaibles/preprocessors.py   | 124 ++++++++++++++++++++++-
 2 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/05-how_to_data_featuring.ipynb

diff --git a/notebooks/05-how_to_data_featuring.ipynb b/notebooks/05-how_to_data_featuring.ipynb
new file mode 100644
index 0000000..086c633
--- /dev/null
+++ b/notebooks/05-how_to_data_featuring.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "legislative-bridal",
+   "metadata": {},
+   "source": [
+    "# How to test various featurings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dutch-heavy",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from predictsignauxfaibles.preprocessors import get_featuring, apply_log, apply_sqrt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "worst-cambodia",
+   "metadata": {},
+   "source": [
+    "# 1. List of features\n",
+    "You can either build the list of variable names by hand, or download the file variables.json with the following command (fill in the proxy `port`):\n",
+    "\n",
+    "```\n",
+    "curl --proxy socks5h://localhost: -L https://raw.githubusercontent.com/signaux-faibles/opensignauxfaibles/master/js/reduce.algo2/docs/variables.json -o output/data/variables.json\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "altered-bicycle",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"output/data/variables.json\", 'r', encoding = 'utf-8') as f:\n",
+    "    variables = json.load(f)\n",
+    "\n",
+    "features = list(set([x['name'] for x in variables]) - set([\"outcome\"]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "looking-reply",
+   "metadata": {},
+   "source": [
+    "# 2. Exploration of relevant featuring\n",
+    "For each variable taken separately, the function `get_featuring` provides the transformation that best explains the variable `outcome` in a simple logistic regression. These transformations still need to be tested in the SF model afterwards."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "compact-jurisdiction", + "metadata": {}, + "outputs": [], + "source": [ + "res = get_featuring(features, [apply_log, apply_sqrt])\n", + "res" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/predictsignauxfaibles/preprocessors.py b/predictsignauxfaibles/preprocessors.py index 014cf52..b3ddb58 100644 --- a/predictsignauxfaibles/preprocessors.py +++ b/predictsignauxfaibles/preprocessors.py @@ -1,7 +1,10 @@ from collections import namedtuple - +from typing import List, Callable +import math import numpy as np import pandas as pd +from sklearn.linear_model import LogisticRegression +from predictsignauxfaibles.data import SFDataset Preprocessor = namedtuple("Preprocessor", ["name", "function", "input", "output"]) @@ -58,3 +61,122 @@ def acoss_make_avg_delta_dette_par_effectif(data: pd.DataFrame): columns_to_drop = ["dette_par_effectif", "dette_par_effectif_past_3"] data.drop(columns=columns_to_drop, axis=1, inplace=True) return data + + +def apply_log(number: float) -> float: + """ + Apply the transformation number -> log(number + 1) + """ + return math.log(number + 1) + + +def apply_sqrt(number: float) -> float: + """ + Apply the transformation number -> sqrt(number) + """ + return math.sqrt(number) + + +def get_featuring( + features: List[str], funcs: List[Callable[[float], float]] +) -> List[dict]: + """ + For each feature in 'features' taken separately, provide the transformation among + 'funcs' to apply to improve the prediction performance with a LogisticRegression + (outcome = feature). For significant results, this should be bootstrap but it + is enough to test the transformations in the model used in the project in a + second step. + """ + dataset = SFDataset( + date_min="2015-01-01", + date_max="2020-06-30", + fields=["outcome"] + features, + sample_size=1000, + ) + dataset.fetch_data() + + res = [ + get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs + ] + + res_as_df = pd.DataFrame(res) + res_as_df = res_as_df[res_as_df["is_relevant"]] + + return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( + ["feature"] + ) + + +def get_featuring_unitary( + data: pd.DataFrame, feat: str, func: Callable[[float], float] +) -> dict: + """ + Apply the transformation 'func' to the feature 'feat', build a LogisticRegression + (outcome = feature) with and without the transformation and determine if it was + relevant. 
+ """ + data = data[["outcome", feat]].copy() + + # handle missing value + data.dropna(inplace=True) + + if len(data[feat]) == 0: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle non-numeric + if not all([type(x) in [int, float] for x in data[feat]]): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle negative values + if any(data[feat] < 0): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + response = data.outcome + + # handle singular class in the response + if len(response.unique()) == 1: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + feat_values = np.array(data[feat]).reshape(-1, 1) + + # Logistic without featuring + model = LogisticRegression() + model.fit(feat_values, response) + score_before = model.score(feat_values, response) + + # Logistic with featuring + feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) + model = LogisticRegression() + model.fit(feat_values, response) + score_after = model.score(feat_values, response) + + return { + "feature": feat, + "func": func.__name__, + "score_before": score_before, + "score_after": score_after, + "is_relevant": score_after > score_before, + } From 82c0d06e2c73004d7c9d423cefe3f758d7f0ea76 Mon Sep 17 00:00:00 2001 From: jnkien Date: Wed, 7 Jul 2021 15:56:40 +0200 Subject: [PATCH 2/3] Add transformers for the featuring and test a model with enhanced featuring --- models/default_with_featuring/README.md | 4 + models/default_with_featuring/model_conf.py | 204 ++++++++++++++++++++ notebooks/05-how_to_data_featuring.ipynb | 166 +++++++++++++++- predictsignauxfaibles/preprocessors.py | 125 +----------- predictsignauxfaibles/transformers.py | 167 ++++++++++++++++ 5 files changed, 535 insertions(+), 131 deletions(-) create mode 100644 models/default_with_featuring/README.md create mode 100644 models/default_with_featuring/model_conf.py create mode 100644 predictsignauxfaibles/transformers.py diff --git a/models/default_with_featuring/README.md b/models/default_with_featuring/README.md new file mode 100644 index 0000000..6238649 --- /dev/null +++ b/models/default_with_featuring/README.md @@ -0,0 +1,4 @@ +# Modèle "default_with_featuring" + +Ce modèle est identique au modèle "default" mais avec des préprocessing de variables personnalisés + diff --git a/models/default_with_featuring/model_conf.py b/models/default_with_featuring/model_conf.py new file mode 100644 index 0000000..b7febb5 --- /dev/null +++ b/models/default_with_featuring/model_conf.py @@ -0,0 +1,204 @@ +from datetime import datetime +import logging +import os +import subprocess + +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn_pandas import DataFrameMapper + +from predictsignauxfaibles.data import SFDataset, OversampledSFDataset +from predictsignauxfaibles.pipelines import DEFAULT_PIPELINE +from predictsignauxfaibles.utils import check_feature +from predictsignauxfaibles.transformers import SqrtTransformer,LogTransformer + +# ENV (default is "develop", can be set to "prod") +ENV = os.getenv("ENV", "develop") + + +# Model Information +MODEL_ID = "202103_logreg_full_with_featuring" 
+MODEL_RUN_DATE = datetime.today() +MODEL_GIT_SHA = str( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]), encoding="utf-8" +).rstrip("\n") + +# Variables disponibles en base : +# https://github.com/signaux-faibles/opensignauxfaibles/master/js/reduce.algo2/docs/variables.json +VARIABLES = [ + "financier_court_terme", + "interets", + "ca", + "equilibre_financier", + "endettement", + "degre_immo_corporelle", + "liquidite_reduite", + "poids_bfr_exploitation", + "productivite_capital_investi", + "rentabilite_economique", + "rentabilite_nette", + "cotisation", + "cotisation_moy12m", + "montant_part_ouvriere", + "montant_part_ouvriere_past_1", + "montant_part_ouvriere_past_12", + "montant_part_ouvriere_past_2", + "montant_part_ouvriere_past_3", + "montant_part_ouvriere_past_6", + "montant_part_patronale", + "montant_part_patronale_past_1", + "montant_part_patronale_past_12", + "montant_part_patronale_past_2", + "montant_part_patronale_past_3", + "montant_part_patronale_past_6", + "ratio_dette", + "ratio_dette_moy12m", + "effectif", + "apart_heures_consommees_cumulees", + "apart_heures_consommees", + "paydex_nb_jours", + "paydex_nb_jours_past_12", +] + +# ces variables sont toujours requêtées +VARIABLES += ["outcome", "periode", "siret", "siren", "time_til_outcome", "code_naf"] + +# Model-specific préprocessing +TRANSFO_PIPELINE = DEFAULT_PIPELINE + +# features +FEATURE_GROUPS = { + "sante_financiere": [ + "financier_court_terme", + "interets", + "ca", + "equilibre_financier", + "endettement", + "degre_immo_corporelle", + "liquidite_reduite", + "poids_bfr_exploitation", + "productivite_capital_investi", + "rentabilite_economique", + "rentabilite_nette", + ], + "activite_partielle": [ + "apart_heures_consommees_cumulees", + "apart_heures_consommees", + ], + "retards_paiement": [ + "paydex_group", + "paydex_yoy", + ], + "dette_urssaf": [ + "ratio_dette", + "avg_delta_dette_par_effectif", + ], + "miscellaneous": [], +} + +FEATURES = [feat for group_feats in FEATURE_GROUPS.values() for feat in group_feats] + +for feature in FEATURES: + if not check_feature(feature, VARIABLES, TRANSFO_PIPELINE): + raise ValueError( + f"Feature '{feature}' is not in VARIABLES nor created by the PIPELINE" + ) + +# model +TO_ONEHOT_ENCODE = ["paydex_group"] +# /!\ Onehot variables must be listed in the same order as in features, for explain function +TO_ONEHOT_ENCODE = [ + to_oh_enc for to_oh_enc in FEATURES if to_oh_enc in TO_ONEHOT_ENCODE +] + +TO_LOG = [ + 'time_til_default', + 'montant_part_patronale', + 'couverture_ca_besoin_fdr_past_2', + 'montant_part_ouvriere', + 'montant_part_patronale_past_1', + 'montant_part_patronale_past_6', + 'montant_part_patronale_past_12', + 'montant_part_ouvriere_past_3', + 'montant_part_ouvriere_past_12', + 'debit_entreprise', + 'apart_heures_autorisees' +] +TO_LOG = [x for x in TO_LOG if x in VARIABLES] + +TO_SQRT = [ + 'delai_montant_echeancier', + 'montant_part_patronale_past_2', + 'montant_part_patronale_past_3', + 'montant_part_ouvriere_past_1', + 'montant_part_ouvriere_past_6', + 'part_salaries', + 'part_salaries_past_1', + 'apart_heures_consommees_cumulees', + 'apart_entreprise', + 'apart_heures_consommees' +] +TO_SQRT = [x for x in TO_SQRT if x in VARIABLES] + +TO_SCALE = list(set(FEATURES) - set(TO_ONEHOT_ENCODE) - set(TO_LOG) - set(TO_SQRT)) + +mapper = DataFrameMapper( + [ + (TO_ONEHOT_ENCODE, [OneHotEncoder()]), + (TO_SCALE, [StandardScaler()]), + (TO_LOG, [LogTransformer(),StandardScaler()]), + (TO_SQRT, [SqrtTransformer(), StandardScaler()]), + ], +) + 
+MODEL_PIPELINE = Pipeline( + [ + ("transform_dataframe", mapper), + ("fit_model", LogisticRegression()) + ] +) + +# Train Dataset +TRAIN_FROM = "2016-01-01" +TRAIN_TO = "2018-06-30" +TRAIN_SAMPLE_SIZE = 1_000_000 if ENV == "prod" else 5_000 +TRAIN_OVERSAMPLING = 0.2 +TRAIN_DATASET = OversampledSFDataset( + TRAIN_OVERSAMPLING, + date_min=TRAIN_FROM, + date_max=TRAIN_TO, + fields=VARIABLES, + sample_size=TRAIN_SAMPLE_SIZE, +) + +# Test Dataset +TEST_FROM = "2018-07-01" +TEST_TO = "2018-10-31" +TEST_SAMPLE_SIZE = 250_000 if ENV == "prod" else 5_000 +TEST_DATASET = SFDataset( + date_min=TEST_FROM, + date_max=TEST_TO, + fields=VARIABLES, + sample_size=TEST_SAMPLE_SIZE, +) + +# Predict Dataset +PREDICT_ON = "2020-02-01" +PREDICT_SAMPLE_SIZE = 1_000_000_000 if ENV == "prod" else 5_000 +PREDICT_DATASET = SFDataset( + date_min=PREDICT_ON, + date_max=PREDICT_ON[:-2] + "28", + fields=VARIABLES, + sample_size=PREDICT_SAMPLE_SIZE, +) + +# Evaluation parameters +EVAL_BETA = 2 + +if __name__ == "__main__": + logging.getLogger().setLevel("INFO") + logging.info(f"ENV : {ENV}") + logging.info(f"Model {MODEL_ID}") + logging.info(f"Run on {MODEL_RUN_DATE}") + logging.info(f"Current commit: {MODEL_GIT_SHA}") diff --git a/notebooks/05-how_to_data_featuring.ipynb b/notebooks/05-how_to_data_featuring.ipynb index 086c633..0efa71b 100644 --- a/notebooks/05-how_to_data_featuring.ipynb +++ b/notebooks/05-how_to_data_featuring.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "legislative-bridal", + "id": "junior-booking", "metadata": {}, "source": [ "# How to test various featurings" @@ -11,17 +11,17 @@ { "cell_type": "code", "execution_count": null, - "id": "dutch-heavy", + "id": "fitting-greek", "metadata": {}, "outputs": [], "source": [ "import json\n", - "from predictsignauxfaibles.preprocessors import get_featuring, apply_log, apply_sqrt" + "from predictsignauxfaibles.transformers import print_featuring_for_model_conf, get_featuring, apply_log, apply_sqrt" ] }, { "cell_type": "markdown", - "id": "worst-cambodia", + "id": "studied-probe", "metadata": {}, "source": [ "# 1. List of features\n", @@ -35,7 +35,7 @@ { "cell_type": "code", "execution_count": null, - "id": "altered-bicycle", + "id": "parental-offset", "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "markdown", - "id": "looking-reply", + "id": "individual-concert", "metadata": {}, "source": [ "# 2. Exploration of relevant featuring\n", @@ -57,13 +57,165 @@ { "cell_type": "code", "execution_count": null, - "id": "compact-jurisdiction", + "id": "medium-design", "metadata": {}, "outputs": [], "source": [ "res = get_featuring(features, [apply_log, apply_sqrt])\n", "res" ] + }, + { + "cell_type": "markdown", + "id": "variable-relative", + "metadata": {}, + "source": [ + "# 3. Print the tranformation for each feature if relevant ready to be plugged in a model_conf.py." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fuzzy-briefs", + "metadata": {}, + "outputs": [], + "source": [ + "print_featuring_for_model_conf(res)" + ] + }, + { + "cell_type": "markdown", + "id": "worst-liquid", + "metadata": {}, + "source": [ + "# 4. 
(optional) Build/Export train and test datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ready-memory", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.getLogger().setLevel(logging.INFO)\n", + "from predictsignauxfaibles.utils import load_conf\n", + "\n", + "conf = load_conf(\"default\")\n", + "\n", + "train = conf.TRAIN_DATASET\n", + "train.sample_size = 1e4\n", + "\n", + "test = conf.TEST_DATASET\n", + "test.sample_size = 1e4\n", + "\n", + "savepath = \"output/data/featuring\"\n", + "\n", + "train.fetch_data().raise_if_empty()\n", + "test.fetch_data().raise_if_empty()\n", + "logging.info(\"Succesfully loaded Features data from MongoDB\")\n", + "\n", + "if savepath is not None:\n", + " train.data.to_csv(f\"{savepath}_train.csv\")\n", + " test.data.to_csv(f\"{savepath}_test.csv\")\n", + " logging.info(f\"Saved Features extract to {savepath}\")" + ] + }, + { + "cell_type": "markdown", + "id": "formal-architect", + "metadata": {}, + "source": [ + "# 5. Get data from csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "grave-treatment", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.getLogger().setLevel(logging.INFO)\n", + "\n", + "import pandas as pd\n", + "\n", + "from predictsignauxfaibles.config import IGNORE_NA\n", + "from predictsignauxfaibles.pipelines import run_pipeline\n", + "from predictsignauxfaibles.utils import load_conf\n", + "from predictsignauxfaibles.evaluate import evaluate\n", + "\n", + "csvpath = \"output/data/featuring\"\n", + "\n", + "train_filepath = f\"{csvpath}_train.csv\"\n", + "test_filepath = f\"{csvpath}_test.csv\"\n", + "\n", + "train_data = pd.read_csv(train_filepath)\n", + "logging.info(f\"Succesfully loaded train data from {train_filepath}\")\n", + "\n", + "test_data = pd.read_csv(test_filepath)\n", + "logging.info(f\"Succesfully loaded test data from {test_filepath}\")" + ] + }, + { + "cell_type": "markdown", + "id": "modern-printing", + "metadata": {}, + "source": [ + "# 6. 
Evaluate a model with and without featuring and compare performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "insured-expert", + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_to_compare(train_data, test_data, conf_name:str = \"default\"):\n", + " conf = load_conf(conf_name)\n", + " train = conf.TRAIN_DATASET\n", + " train.sample_size = 1e4\n", + " \n", + " test = conf.TEST_DATASET\n", + " test.sample_size = 1e4\n", + " \n", + " train.data = train_data\n", + " test.data = test_data\n", + " \n", + " train_siren_set = train.data[\"siren\"].unique().tolist()\n", + " test.remove_siren(train_siren_set)\n", + " \n", + " train.replace_missing_data().remove_na(ignore=IGNORE_NA)\n", + " train.data = run_pipeline(train.data, conf.TRANSFO_PIPELINE)\n", + " \n", + " test.replace_missing_data().remove_na(ignore=IGNORE_NA)\n", + " test.data = run_pipeline(test.data, conf.TRANSFO_PIPELINE)\n", + " \n", + " model_pp = conf.MODEL_PIPELINE\n", + " fit = model_pp.fit(train.data, train.data[\"outcome\"])\n", + " \n", + " eval_metrics = evaluate(fit, test, conf.EVAL_BETA)\n", + " return {\n", + " 'conf_name': conf_name,\n", + " 'aucpr': eval_metrics['aucpr'] \n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ordered-dallas", + "metadata": {}, + "outputs": [], + "source": [ + "perf_default = evaluate_to_compare(train_data, test_data, \"default\")\n", + "perf_default_with_featuring = evaluate_to_compare(train_data, test_data, \"default_with_featuring\")\n", + "print(perf_default)\n", + "print(perf_default_with_featuring)" + ] } ], "metadata": { diff --git a/predictsignauxfaibles/preprocessors.py b/predictsignauxfaibles/preprocessors.py index b3ddb58..44b7519 100644 --- a/predictsignauxfaibles/preprocessors.py +++ b/predictsignauxfaibles/preprocessors.py @@ -1,10 +1,6 @@ from collections import namedtuple -from typing import List, Callable -import math -import numpy as np import pandas as pd -from sklearn.linear_model import LogisticRegression -from predictsignauxfaibles.data import SFDataset +import numpy as np Preprocessor = namedtuple("Preprocessor", ["name", "function", "input", "output"]) @@ -61,122 +57,3 @@ def acoss_make_avg_delta_dette_par_effectif(data: pd.DataFrame): columns_to_drop = ["dette_par_effectif", "dette_par_effectif_past_3"] data.drop(columns=columns_to_drop, axis=1, inplace=True) return data - - -def apply_log(number: float) -> float: - """ - Apply the transformation number -> log(number + 1) - """ - return math.log(number + 1) - - -def apply_sqrt(number: float) -> float: - """ - Apply the transformation number -> sqrt(number) - """ - return math.sqrt(number) - - -def get_featuring( - features: List[str], funcs: List[Callable[[float], float]] -) -> List[dict]: - """ - For each feature in 'features' taken separately, provide the transformation among - 'funcs' to apply to improve the prediction performance with a LogisticRegression - (outcome = feature). For significant results, this should be bootstrap but it - is enough to test the transformations in the model used in the project in a - second step. 
- """ - dataset = SFDataset( - date_min="2015-01-01", - date_max="2020-06-30", - fields=["outcome"] + features, - sample_size=1000, - ) - dataset.fetch_data() - - res = [ - get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs - ] - - res_as_df = pd.DataFrame(res) - res_as_df = res_as_df[res_as_df["is_relevant"]] - - return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( - ["feature"] - ) - - -def get_featuring_unitary( - data: pd.DataFrame, feat: str, func: Callable[[float], float] -) -> dict: - """ - Apply the transformation 'func' to the feature 'feat', build a LogisticRegression - (outcome = feature) with and without the transformation and determine if it was - relevant. - """ - data = data[["outcome", feat]].copy() - - # handle missing value - data.dropna(inplace=True) - - if len(data[feat]) == 0: - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - # handle non-numeric - if not all([type(x) in [int, float] for x in data[feat]]): - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - # handle negative values - if any(data[feat] < 0): - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - - response = data.outcome - - # handle singular class in the response - if len(response.unique()) == 1: - return { - "feature": feat, - "func": func.__name__, - "score_before": np.nan, - "score_after": np.nan, - "is_relevant": False, - } - feat_values = np.array(data[feat]).reshape(-1, 1) - - # Logistic without featuring - model = LogisticRegression() - model.fit(feat_values, response) - score_before = model.score(feat_values, response) - - # Logistic with featuring - feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) - model = LogisticRegression() - model.fit(feat_values, response) - score_after = model.score(feat_values, response) - - return { - "feature": feat, - "func": func.__name__, - "score_before": score_before, - "score_after": score_after, - "is_relevant": score_after > score_before, - } diff --git a/predictsignauxfaibles/transformers.py b/predictsignauxfaibles/transformers.py new file mode 100644 index 0000000..0de599f --- /dev/null +++ b/predictsignauxfaibles/transformers.py @@ -0,0 +1,167 @@ +import logging +import math +import re +from typing import List, Callable +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.linear_model import LogisticRegression +from predictsignauxfaibles.data import SFDataset + + +class SqrtTransformer(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + + def fit(self, X, y = None): + return self + + + def transform(self, X, y = None): + X_ = X.copy() + X_ = np.sqrt(X_) + return X_ + + +class LogTransformer(BaseEstimator, TransformerMixin): + def __init__(self): + pass + + + def fit(self, X, y = None): + return self + + + def transform(self, X, y = None): + X_ = X.copy() + X_ = np.log(X_+1) + return X_ + + +def apply_log(number: float) -> float: + """ + Apply the transformation number -> log(number + 1) + """ + return math.log(number + 1) + + +def apply_sqrt(number: float) -> float: + """ + Apply the transformation number -> sqrt(number) + """ + return math.sqrt(number) + + +def print_featuring_for_model_conf(featuring: pd.DataFrame) -> str: + """ + Print the tranformation for 
each feature to be plugged in a model_conf.py. + """ + featuring = featuring[['func', 'feature']].groupby('func').agg(list).reset_index() + featuring = {func:feat for (feat, func) in zip(featuring['feature'], featuring['func'])} + return featuring + + +def get_featuring( + features: List[str], funcs: List[Callable[[float], float]] +) -> pd.DataFrame: + """ + For each feature in 'features' taken separately, provide the transformation among + 'funcs' to apply to improve the prediction performance with a LogisticRegression + (outcome = feature). For significant results, this should be bootstrap but it + is enough to test the transformations in the model used in the project in a + second step. + """ + dataset = SFDataset( + date_min="2015-01-01", + date_max="2020-06-30", + fields=["outcome"] + features, + sample_size=1000, + ) + dataset.fetch_data() + + res = [ + get_featuring_unitary(dataset.data, feat, f) for feat in features for f in funcs + ] + + res_as_df = pd.DataFrame(res) + res_as_df = res_as_df[res_as_df["is_relevant"]] + + return res_as_df.sort_values("score_after", ascending=False).drop_duplicates( + ["feature"] + ) + + +def get_featuring_unitary( + data: pd.DataFrame, feat: str, func: Callable[[float], float] +) -> dict: + """ + Apply the transformation 'func' to the feature 'feat', build a LogisticRegression + (outcome = feature) with and without the transformation and determine if it was + relevant. + """ + data = data[["outcome", feat]].copy() + + # handle missing value + data.dropna(inplace=True) + + if len(data[feat]) == 0: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle non-numeric + if not all([type(x) in [int, float] for x in data[feat]]): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + # handle negative values + if any(data[feat] < 0): + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + + response = data.outcome + + # handle singular class in the response + if len(response.unique()) == 1: + return { + "feature": feat, + "func": func.__name__, + "score_before": np.nan, + "score_after": np.nan, + "is_relevant": False, + } + feat_values = np.array(data[feat]).reshape(-1, 1) + + # Logistic without featuring + model = LogisticRegression() + model.fit(feat_values, response) + score_before = model.score(feat_values, response) + + # Logistic with featuring + feat_values = np.array(list(map(func, feat_values))).reshape(-1, 1) + model = LogisticRegression() + model.fit(feat_values, response) + score_after = model.score(feat_values, response) + + return { + "feature": feat, + "func": func.__name__, + "score_before": score_before, + "score_after": score_after, + "is_relevant": score_after > score_before, + } From ec7cc8889eb28b2fffac901e7b1757d378fc7ea5 Mon Sep 17 00:00:00 2001 From: jnkien Date: Wed, 7 Jul 2021 16:10:19 +0200 Subject: [PATCH 3/3] add pylint disablings + hook refactoring --- models/default_with_featuring/model_conf.py | 51 ++++++++++----------- predictsignauxfaibles/transformers.py | 36 +++++++-------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/models/default_with_featuring/model_conf.py b/models/default_with_featuring/model_conf.py index b7febb5..49be57a 100644 --- a/models/default_with_featuring/model_conf.py +++ b/models/default_with_featuring/model_conf.py @@ -11,7 +11,7 @@ 
from predictsignauxfaibles.data import SFDataset, OversampledSFDataset from predictsignauxfaibles.pipelines import DEFAULT_PIPELINE from predictsignauxfaibles.utils import check_feature -from predictsignauxfaibles.transformers import SqrtTransformer,LogTransformer +from predictsignauxfaibles.transformers import SqrtTransformer, LogTransformer # ENV (default is "develop", can be set to "prod") ENV = os.getenv("ENV", "develop") @@ -113,31 +113,31 @@ ] TO_LOG = [ - 'time_til_default', - 'montant_part_patronale', - 'couverture_ca_besoin_fdr_past_2', - 'montant_part_ouvriere', - 'montant_part_patronale_past_1', - 'montant_part_patronale_past_6', - 'montant_part_patronale_past_12', - 'montant_part_ouvriere_past_3', - 'montant_part_ouvriere_past_12', - 'debit_entreprise', - 'apart_heures_autorisees' + "time_til_default", + "montant_part_patronale", + "couverture_ca_besoin_fdr_past_2", + "montant_part_ouvriere", + "montant_part_patronale_past_1", + "montant_part_patronale_past_6", + "montant_part_patronale_past_12", + "montant_part_ouvriere_past_3", + "montant_part_ouvriere_past_12", + "debit_entreprise", + "apart_heures_autorisees", ] TO_LOG = [x for x in TO_LOG if x in VARIABLES] TO_SQRT = [ - 'delai_montant_echeancier', - 'montant_part_patronale_past_2', - 'montant_part_patronale_past_3', - 'montant_part_ouvriere_past_1', - 'montant_part_ouvriere_past_6', - 'part_salaries', - 'part_salaries_past_1', - 'apart_heures_consommees_cumulees', - 'apart_entreprise', - 'apart_heures_consommees' + "delai_montant_echeancier", + "montant_part_patronale_past_2", + "montant_part_patronale_past_3", + "montant_part_ouvriere_past_1", + "montant_part_ouvriere_past_6", + "part_salaries", + "part_salaries_past_1", + "apart_heures_consommees_cumulees", + "apart_entreprise", + "apart_heures_consommees", ] TO_SQRT = [x for x in TO_SQRT if x in VARIABLES] @@ -147,16 +147,13 @@ [ (TO_ONEHOT_ENCODE, [OneHotEncoder()]), (TO_SCALE, [StandardScaler()]), - (TO_LOG, [LogTransformer(),StandardScaler()]), + (TO_LOG, [LogTransformer(), StandardScaler()]), (TO_SQRT, [SqrtTransformer(), StandardScaler()]), ], ) MODEL_PIPELINE = Pipeline( - [ - ("transform_dataframe", mapper), - ("fit_model", LogisticRegression()) - ] + [("transform_dataframe", mapper), ("fit_model", LogisticRegression())] ) # Train Dataset diff --git a/predictsignauxfaibles/transformers.py b/predictsignauxfaibles/transformers.py index 0de599f..ac89cd3 100644 --- a/predictsignauxfaibles/transformers.py +++ b/predictsignauxfaibles/transformers.py @@ -1,6 +1,4 @@ -import logging import math -import re from typing import List, Callable import numpy as np import pandas as pd @@ -10,35 +8,33 @@ class SqrtTransformer(BaseEstimator, TransformerMixin): + # pylint: disable = E, C def __init__(self): pass - - - def fit(self, X, y = None): + + def fit(self): return self - - - def transform(self, X, y = None): + + def transform(X): X_ = X.copy() X_ = np.sqrt(X_) return X_ class LogTransformer(BaseEstimator, TransformerMixin): + # pylint: disable=E, C def __init__(self): pass - - - def fit(self, X, y = None): + + def fit(self): return self - - - def transform(self, X, y = None): + + def transform(X): X_ = X.copy() - X_ = np.log(X_+1) + X_ = np.log(X_ + 1) return X_ - + def apply_log(number: float) -> float: """ Apply the transformation number -> log(number + 1) @@ -52,13 +48,15 @@ def apply_sqrt(number: float) -> float: """ return math.sqrt(number) - + def print_featuring_for_model_conf(featuring: pd.DataFrame) -> str: """ Print the tranformation for each feature to be plugged 
in a model_conf.py.
     """
-    featuring = featuring[['func', 'feature']].groupby('func').agg(list).reset_index()
-    featuring = {func:feat for (feat, func) in zip(featuring['feature'], featuring['func'])}
+    featuring = featuring[["func", "feature"]].groupby("func").agg(list).reset_index()
+    featuring = {
+        func: feat for (feat, func) in zip(featuring["feature"], featuring["func"])
+    }
     return featuring
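Note on the `SqrtTransformer` / `LogTransformer` refactoring in PATCH 3/3: scikit-learn (and `sklearn_pandas.DataFrameMapper`) call transformers as `fit(X, y)` and `transform(X)`, so methods defined as `fit(self)` and `transform(X)` (without `self`) raise a `TypeError` when `MODEL_PIPELINE.fit` runs. Below is a minimal, signature-compatible sketch — not part of the patches above — that keeps the same numpy behaviour and quiets pylint with targeted disables instead of dropping the arguments.

```python
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class LogTransformer(BaseEstimator, TransformerMixin):
    """Element-wise x -> log(x + 1); assumes non-negative inputs."""

    def fit(self, X, y=None):  # pylint: disable=unused-argument,invalid-name
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, X):  # pylint: disable=invalid-name
        return np.log(np.asarray(X, dtype=float) + 1)


class SqrtTransformer(BaseEstimator, TransformerMixin):
    """Element-wise x -> sqrt(x); assumes non-negative inputs."""

    def fit(self, X, y=None):  # pylint: disable=unused-argument,invalid-name
        return self

    def transform(self, X):  # pylint: disable=invalid-name
        return np.sqrt(np.asarray(X, dtype=float))
```

With these signatures, the `DataFrameMapper` entries in `model_conf.py`, e.g. `(TO_LOG, [LogTransformer(), StandardScaler()])`, can be fitted as written.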