From e1edb18c4a184e771de577eca6ab24c77fe38339 Mon Sep 17 00:00:00 2001 From: Chris Lemke <11752694+chrislemke@users.noreply.github.com> Date: Tue, 17 Jan 2023 09:59:31 +0000 Subject: [PATCH] refactor: add init file for easier usage of transformers (#45) --- docs/README.md | 6 +-- examples/playground.ipynb | 42 +++++++++---------- src/sk_transformers/__init__.py | 26 ++++++++++++ src/sk_transformers/datetime_transformer.py | 4 +- src/sk_transformers/deep_transformer.py | 2 +- src/sk_transformers/encoder_transformer.py | 2 +- src/sk_transformers/generic_transformer.py | 24 +++++------ src/sk_transformers/number_transformer.py | 2 +- src/sk_transformers/string_transformer.py | 10 ++--- .../test_datetime_transformer.py | 5 +-- .../test_transformer/test_deep_transformer.py | 2 +- .../test_encoder_transformer.py | 2 +- .../test_generic_transformer.py | 2 +- .../test_number_transformer.py | 2 +- .../test_string_transformer.py | 2 +- 15 files changed, 77 insertions(+), 56 deletions(-) create mode 100644 src/sk_transformers/__init__.py diff --git a/docs/README.md b/docs/README.md index fa1e997..df7b6fd 100644 --- a/docs/README.md +++ b/docs/README.md @@ -74,7 +74,7 @@ Let's assume you want to use some method from [NumPy's mathematical functions, t use the [`MathExpressionTransformer`](https://chrislemke.github.io/sk-transformers/number_transformer-reference/#sk-transformers.transformer.number_transformer.MathExpressionTransformer). ```python import pandas as pd -from sk_transformers.number_transformer import MathExpressionTransformer +from sk_transformers import MathExpressionTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) transformer = MathExpressionTransformer([("foo", "np.sum", "bar", {"axis": 0})]) @@ -91,8 +91,8 @@ In the next example, we additionally add the [`MapTransformer`](https://chrislem Together with [scikit-learn's pipelines](https://scikit-learn.org/stable/modules/compose.html#combining-estimators) it would look like this: ```python import pandas as pd -from sk_transformers.number_transformer import MathExpressionTransformer -from sk_transformers.generic_transformer import MapTransformer +from sk_transformers import MathExpressionTransformer +from sk_transformers import MapTransformer from sklearn.pipeline import Pipeline X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) diff --git a/examples/playground.ipynb b/examples/playground.ipynb index 0c2af26..7326001 100644 --- a/examples/playground.ipynb +++ b/examples/playground.ipynb @@ -56,7 +56,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.datetime_transformer import DurationCalculatorTransformer\n", + "from sk_transformers import DurationCalculatorTransformer\n", "\n", "X = pd.DataFrame(\n", " {\n", @@ -85,7 +85,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.datetime_transformer import TimestampTransformer\n", + "from sk_transformers import TimestampTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"1960-01-01\", \"1970-01-01\", \"1990-01-01\"]})\n", "transformer = TimestampTransformer([\"foo\"])\n", @@ -123,7 +123,7 @@ "import numpy as np\n", "import pandas as pd\n", "from pytorch_widedeep.datasets import load_adult\n", - "from sk_transformers.deep_transformer import ToVecTransformer\n", + "from sk_transformers import ToVecTransformer\n", "\n", "df = load_adult(as_frame=True)\n", "df[\"target\"] = (df[\"income\"].apply(lambda x: \">50K\" in x)).astype(int)\n", @@ -168,7 +168,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.encoder_transformer import MeanEncoderTransformer\n", + "from sk_transformers import MeanEncoderTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"a\", \"b\", \"a\", \"c\", \"b\", \"a\", \"c\", \"a\", \"b\", \"c\"]})\n", "y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])\n", @@ -204,7 +204,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import AggregateTransformer\n", + "from sk_transformers import AggregateTransformer\n", "\n", "X = pd.DataFrame(\n", " {\n", @@ -234,7 +234,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import ColumnDropperTransformer\n", + "from sk_transformers import ColumnDropperTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 2, 3], \"bar\": [4, 5, 6]})\n", "transformer = ColumnDropperTransformer([\"foo\"])\n", @@ -259,7 +259,7 @@ "source": [ "import numpy as np\n", "import pandas as pd\n", - "from sk_transformers.generic_transformer import DtypeTransformer\n", + "from sk_transformers import DtypeTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 2, 3], \"bar\": [\"a\", \"a\", \"b\"]})\n", "transformer = DtypeTransformer([(\"foo\", np.float32), (\"bar\", \"category\")])\n", @@ -285,7 +285,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import FunctionsTransformer\n", + "from sk_transformers import FunctionsTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 2, 3], \"bar\": [4, 5, 6]})\n", "transformer = FunctionsTransformer([(\"foo\", np.log1p, None), (\"bar\", np.sqrt, None)])\n", @@ -309,7 +309,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import MapTransformer\n", + "from sk_transformers import MapTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 2, 3], \"bar\": [4, 5, 6]})\n", "transformer = MapTransformer([(\"foo\", lambda x: x + 1)])\n", @@ -335,7 +335,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import LeftJoinTransformer\n", + "from sk_transformers import LeftJoinTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"A\", \"B\", \"C\", \"A\", \"C\"]})\n", "lookup_df = pd.Series([1, 2, 3], index=[\"A\", \"B\", \"C\"], name=\"values\")\n", @@ -359,7 +359,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sk_transformers.generic_transformer import NaNTransformer\n", + "from sk_transformers import NaNTransformer\n", "import pandas as pd\n", "import numpy as np\n", "\n", @@ -389,7 +389,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import QueryTransformer\n", + "from sk_transformers import QueryTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 8, 3, 6, 5, 4, 7, 2]})\n", "transformer = QueryTransformer([\"foo > 4\"])\n", @@ -418,7 +418,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sk_transformers.generic_transformer import ValueIndicatorTransformer\n", + "from sk_transformers import ValueIndicatorTransformer\n", "import pandas as pd\n", "\n", "X = pd.DataFrame({\"foo\": [1, -999, 3], \"bar\": [\"a\", \"-999\", \"c\"]})\n", @@ -446,7 +446,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.generic_transformer import ValueReplacerTransformer\n", + "from sk_transformers import ValueReplacerTransformer\n", "\n", "X = pd.DataFrame(\n", " {\"foo\": [\"0000-01-01\", \"2022/01/08\", \"bar\", \"1982-12-7\", \"28-09-2022\"]}\n", @@ -492,7 +492,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.number_transformer import MathExpressionTransformer\n", + "from sk_transformers import MathExpressionTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [1, 2, 3], \"bar\": [4, 5, 6]})\n", "transformer = MathExpressionTransformer([(\"foo\", \"np.sum\", \"bar\", {\"axis\": 0})])\n", @@ -524,7 +524,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.string_transformer import EmailTransformer\n", + "from sk_transformers import EmailTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"person-123@test.com\"]})\n", "transformer = EmailTransformer([\"foo\"])\n", @@ -550,7 +550,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.string_transformer import IPAddressEncoderTransformer\n", + "from sk_transformers import IPAddressEncoderTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"192.168.1.1\", \"2001:0db8:3c4d:0015:0000:0000:1a2f:1a2b\"]})\n", "transformer = IPAddressEncoderTransformer([\"foo\"])\n", @@ -574,7 +574,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.string_transformer import PhoneTransformer\n", + "from sk_transformers import PhoneTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"+49123456789\", \"0044987654321\", \"3167891234\"]})\n", "transformer = PhoneTransformer([\"foo\"])\n", @@ -598,7 +598,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.string_transformer import StringSimilarityTransformer\n", + "from sk_transformers import StringSimilarityTransformer\n", "\n", "X = pd.DataFrame(\n", " {\n", @@ -630,7 +630,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "from sk_transformers.string_transformer import StringSlicerTransformer\n", + "from sk_transformers import StringSlicerTransformer\n", "\n", "X = pd.DataFrame({\"foo\": [\"abc\", \"def\", \"ghi\"], \"bar\": [\"jkl\", \"mno\", \"pqr\"]})\n", "transformer = StringSlicerTransformer([(\"foo\", (0, 3, 2)), (\"bar\", (2,))])\n", @@ -662,7 +662,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]" }, "vscode": { "interpreter": { diff --git a/src/sk_transformers/__init__.py b/src/sk_transformers/__init__.py new file mode 100644 index 0000000..0745b55 --- /dev/null +++ b/src/sk_transformers/__init__.py @@ -0,0 +1,26 @@ +from sk_transformers.datetime_transformer import ( + DurationCalculatorTransformer, + TimestampTransformer, +) +from sk_transformers.deep_transformer import ToVecTransformer +from sk_transformers.encoder_transformer import MeanEncoderTransformer +from sk_transformers.generic_transformer import ( + AggregateTransformer, + ColumnDropperTransformer, + DtypeTransformer, + FunctionsTransformer, + LeftJoinTransformer, + MapTransformer, + NaNTransformer, + QueryTransformer, + ValueIndicatorTransformer, + ValueReplacerTransformer, +) +from sk_transformers.number_transformer import MathExpressionTransformer +from sk_transformers.string_transformer import ( + EmailTransformer, + IPAddressEncoderTransformer, + PhoneTransformer, + StringSimilarityTransformer, + StringSlicerTransformer, +) diff --git a/src/sk_transformers/datetime_transformer.py b/src/sk_transformers/datetime_transformer.py index a2af5d3..a94fa03 100644 --- a/src/sk_transformers/datetime_transformer.py +++ b/src/sk_transformers/datetime_transformer.py @@ -13,7 +13,7 @@ class DurationCalculatorTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.datetime_transformer import DurationCalculatorTransformer + from sk_transformers import DurationCalculatorTransformer X = pd.DataFrame( { @@ -79,7 +79,7 @@ class TimestampTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.datetime_transformer import TimestampTransformer + from sk_transformers import TimestampTransformer X = pd.DataFrame({"foo": ["1960-01-01", "1970-01-01", "1990-01-01"]}) transformer = TimestampTransformer(["foo"]) diff --git a/src/sk_transformers/deep_transformer.py b/src/sk_transformers/deep_transformer.py index b2ef113..ce4d6d8 100644 --- a/src/sk_transformers/deep_transformer.py +++ b/src/sk_transformers/deep_transformer.py @@ -27,7 +27,7 @@ class ToVecTransformer(BaseEstimator, TransformerMixin): import numpy as np import pandas as pd from pytorch_widedeep.datasets import load_adult - from sk_transformers.deep_transformer import ToVecTransformer + from sk_transformers import ToVecTransformer df = load_adult(as_frame=True) df["target"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int) diff --git a/src/sk_transformers/encoder_transformer.py b/src/sk_transformers/encoder_transformer.py index c9b2486..d595c32 100644 --- a/src/sk_transformers/encoder_transformer.py +++ b/src/sk_transformers/encoder_transformer.py @@ -12,7 +12,7 @@ class MeanEncoderTransformer(BaseEstimator, TransformerMixin): Example: ```python import pandas as pd - from sk_transformers.encoder_transformer import MeanEncoderTransformer + from sk_transformers import MeanEncoderTransformer X = pd.DataFrame({"foo": ["a", "b", "a", "c", "b", "a", "c", "a", "b", "c"]}) y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) diff --git a/src/sk_transformers/generic_transformer.py b/src/sk_transformers/generic_transformer.py index 33aad86..04cba3a 100644 --- a/src/sk_transformers/generic_transformer.py +++ b/src/sk_transformers/generic_transformer.py @@ -16,7 +16,7 @@ class DtypeTransformer(BaseTransformer): ```python import numpy as np import pandas as pd - from sk_transformers.generic_transformer import DtypeTransformer + from sk_transformers import DtypeTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": ["a", "a", "b"]}) transformer = DtypeTransformer([("foo", np.float32), ("bar", "category")]) @@ -69,7 +69,7 @@ class AggregateTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.generic_transformer import AggregateTransformer + from sk_transformers import AggregateTransformer X = pd.DataFrame( { @@ -161,7 +161,7 @@ class FunctionsTransformer(BaseTransformer): ```python import numpy as np import pandas as pd - from sk_transformers.generic_transformer import FunctionsTransformer + from sk_transformers import FunctionsTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) transformer = FunctionsTransformer([("foo", np.log1p, None), ("bar", np.sqrt, None)]) @@ -216,7 +216,7 @@ class MapTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.generic_transformer import MapTransformer + from sk_transformers import MapTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) transformer = MapTransformer([("foo", lambda x: x + 1)]) @@ -263,7 +263,7 @@ class ColumnDropperTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.generic_transformer import ColumnDropperTransformer + from sk_transformers import ColumnDropperTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) transformer = ColumnDropperTransformer(["foo"]) @@ -304,7 +304,7 @@ class NaNTransformer(BaseTransformer): Example: ```python - from sk_transformers.generic_transformer import NaNTransformer + from sk_transformers import NaNTransformer import pandas as pd import numpy as np @@ -357,7 +357,7 @@ class ValueIndicatorTransformer(BaseTransformer): Example: ```python - from sk_transformers.generic_transformer import ValueIndicatorTransformer + from sk_transformers import ValueIndicatorTransformer import pandas as pd X = pd.DataFrame({"foo": [1, -999, 3], "bar": ["a", "-999", "c"]}) @@ -410,14 +410,12 @@ class QueryTransformer(BaseTransformer): """Applies a list of queries to a dataframe. If it operates on a dataset used for supervised learning this transformer should be applied on the dataframe containing `X` and `y`. So removing of columns by queries also - removes the corresponding `y` value. Read more about queries [here](https:/ - - /pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html). + removes the corresponding `y` value. Example: ```python import pandas as pd - from sk_transformers.generic_transformer import QueryTransformer + from sk_transformers import QueryTransformer X = pd.DataFrame({"foo": [1, 8, 3, 6, 5, 4, 7, 2]}) transformer = QueryTransformer(["foo > 4"]) @@ -468,7 +466,7 @@ class ValueReplacerTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.generic_transformer import ValueReplacerTransformer + from sk_transformers import ValueReplacerTransformer X = pd.DataFrame( {"foo": ["0000-01-01", "2022/01/08", "bar", "1982-12-7", "28-09-2022"]} @@ -554,7 +552,7 @@ class LeftJoinTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.generic_transformer import LeftJoinTransformer + from sk_transformers import LeftJoinTransformer X = pd.DataFrame({"foo": ["A", "B", "C", "A", "C"]}) lookup_df = pd.Series([1, 2, 3], index=["A", "B", "C"], name="values") diff --git a/src/sk_transformers/number_transformer.py b/src/sk_transformers/number_transformer.py index f4a4327..15d1045 100644 --- a/src/sk_transformers/number_transformer.py +++ b/src/sk_transformers/number_transformer.py @@ -20,7 +20,7 @@ class MathExpressionTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.number_transformer import MathExpressionTransformer + from sk_transformers import MathExpressionTransformer X = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) transformer = MathExpressionTransformer([("foo", "np.sum", "bar", {"axis": 0})]) diff --git a/src/sk_transformers/string_transformer.py b/src/sk_transformers/string_transformer.py index abf8654..98ee7de 100644 --- a/src/sk_transformers/string_transformer.py +++ b/src/sk_transformers/string_transformer.py @@ -23,7 +23,7 @@ class IPAddressEncoderTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.string_transformer import IPAddressEncoderTransformer + from sk_transformers import IPAddressEncoderTransformer X = pd.DataFrame({"foo": ["192.168.1.1", "2001:0db8:3c4d:0015:0000:0000:1a2f:1a2b"]}) transformer = IPAddressEncoderTransformer(["foo"]) @@ -100,7 +100,7 @@ class EmailTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.string_transformer import EmailTransformer + from sk_transformers import EmailTransformer X = pd.DataFrame({"foo": ["person-123@test.com"]}) transformer = EmailTransformer(["foo"]) @@ -186,7 +186,7 @@ class StringSimilarityTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.string_transformer import StringSimilarityTransformer + from sk_transformers import StringSimilarityTransformer X = pd.DataFrame( { @@ -255,7 +255,7 @@ class PhoneTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.string_transformer import PhoneTransformer + from sk_transformers import PhoneTransformer X = pd.DataFrame({"foo": ["+49123456789", "0044987654321", "3167891234"]}) transformer = PhoneTransformer(["foo"]) @@ -344,7 +344,7 @@ class StringSlicerTransformer(BaseTransformer): Example: ```python import pandas as pd - from sk_transformers.string_transformer import StringSlicerTransformer + from sk_transformers import StringSlicerTransformer X = pd.DataFrame({"foo": ["abc", "def", "ghi"], "bar": ["jkl", "mno", "pqr"]}) transformer = StringSlicerTransformer([("foo", (0, 3, 2)), ("bar", (2,))]) diff --git a/tests/test_transformer/test_datetime_transformer.py b/tests/test_transformer/test_datetime_transformer.py index f3719ef..6dcf5ba 100644 --- a/tests/test_transformer/test_datetime_transformer.py +++ b/tests/test_transformer/test_datetime_transformer.py @@ -2,10 +2,7 @@ import pytest from sklearn.pipeline import make_pipeline -from sk_transformers.datetime_transformer import ( - DurationCalculatorTransformer, - TimestampTransformer, -) +from sk_transformers import DurationCalculatorTransformer, TimestampTransformer # pylint: disable=missing-function-docstring, missing-class-docstring diff --git a/tests/test_transformer/test_deep_transformer.py b/tests/test_transformer/test_deep_transformer.py index 8723143..3bbf2c1 100644 --- a/tests/test_transformer/test_deep_transformer.py +++ b/tests/test_transformer/test_deep_transformer.py @@ -1,6 +1,6 @@ from sklearn.pipeline import make_pipeline -from sk_transformers.deep_transformer import ToVecTransformer +from sk_transformers import ToVecTransformer # pylint: disable=missing-function-docstring, missing-class-docstring diff --git a/tests/test_transformer/test_encoder_transformer.py b/tests/test_transformer/test_encoder_transformer.py index 415ecf9..41e7aa6 100644 --- a/tests/test_transformer/test_encoder_transformer.py +++ b/tests/test_transformer/test_encoder_transformer.py @@ -1,7 +1,7 @@ import numpy as np from sklearn.pipeline import make_pipeline -from sk_transformers.encoder_transformer import MeanEncoderTransformer +from sk_transformers import MeanEncoderTransformer # pylint: disable=missing-function-docstring, missing-class-docstring diff --git a/tests/test_transformer/test_generic_transformer.py b/tests/test_transformer/test_generic_transformer.py index 7d69d9b..c405686 100644 --- a/tests/test_transformer/test_generic_transformer.py +++ b/tests/test_transformer/test_generic_transformer.py @@ -3,7 +3,7 @@ import pytest from sklearn.pipeline import make_pipeline -from sk_transformers.generic_transformer import ( +from sk_transformers import ( AggregateTransformer, ColumnDropperTransformer, DtypeTransformer, diff --git a/tests/test_transformer/test_number_transformer.py b/tests/test_transformer/test_number_transformer.py index 1c91757..eeb3aab 100644 --- a/tests/test_transformer/test_number_transformer.py +++ b/tests/test_transformer/test_number_transformer.py @@ -2,7 +2,7 @@ import pytest from sklearn.pipeline import make_pipeline -from sk_transformers.number_transformer import MathExpressionTransformer +from sk_transformers import MathExpressionTransformer # pylint: disable=missing-function-docstring, missing-class-docstring diff --git a/tests/test_transformer/test_string_transformer.py b/tests/test_transformer/test_string_transformer.py index c5c6454..8263584 100644 --- a/tests/test_transformer/test_string_transformer.py +++ b/tests/test_transformer/test_string_transformer.py @@ -2,7 +2,7 @@ import pandas as pd from sklearn.pipeline import make_pipeline -from sk_transformers.string_transformer import ( +from sk_transformers import ( EmailTransformer, IPAddressEncoderTransformer, PhoneTransformer,