Merge pull request #152 from frank1010111/setup-to-pyproject

Setup to pyproject, also integrated all 2.1.4 -> 2.2.1 changes
fbdesignpro · Aug 25, 2023 · 64d4ae6 · 64d4ae6
2 parents 230faa3 + 661c8c9
commit 64d4ae6
Show file tree

Hide file tree

Showing 11 changed files with 124 additions and 55 deletions.
diff --git a/.gitignore b/.gitignore
@@ -152,3 +152,4 @@ SWEETVIZ_REPORT_COMPARED.html
 SWEETVIZ_REPORT_VERTICAL.html
 docs/images/Art-Targ.psd
 docs/images/Assoc.psd
+UPLOAD_TEST.bat
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Change log
 
+#### 2.2.1 - 2023-08-24 (Major compatibility update)
+
+- **Updated:** Updated the project to use the latest build & packaging pipelines (pyproject.toml)
+- **Updated:** Using the "warnings" library directly instead of np.warnings
+- **Fixed:** "KeyError: None of ['index'] are in the columns" (for pandas > 2.0)
+- **Fixed:** "AttributeError: numpy has no attribute 'warnings'" (for numpy > 1.23)
+- **Fixed:** "AttributeError: 'DataFrame' object has no attribute 'iteritems'"
+- **Fixed:** np.bool deprecation warning
+- **Fixed:** Pandas 'mad()' function deprecation warning
+
 #### 2.1.4 - 2022-06-08
 
 - **Fixed:** removed deprecation warnings

diff --git a/README.md b/README.md
@@ -1,4 +1,6 @@
-![v](https://img.shields.io/badge/version-2.1.4-blue) ![v](https://img.shields.io/badge/updated-June%208,%20%202022-green)
+![v](https://img.shields.io/badge/version-2.2.1-blue) ![v](https://img.shields.io/badge/updated-August%2025,%20%202023-green)
+
+## NEWS (August 2023) - Version 2.2.1: Big compatibility update for Python 3.7+ and the latest NumPy versions!
 
 ![Sweetviz Logo](http://cooltiming.com/SV/logo.png) 
 
@@ -21,7 +23,7 @@ Usage and parameters are described below, [you can also find an article describi
 
 [**Medium Article** describing its features in depth](https://towardsdatascience.com/powerful-eda-exploratory-data-analysis-in-just-two-lines-of-code-using-sweetviz-6c943d32f34)
 
-[![KDNuggets](https://www.kdnuggets.com/images/tkb-2102-g.png)](https://www.kdnuggets.com/2021/03/know-your-data-much-faster-sweetviz-python-library.html)
+[![KDNuggets](https://www.kdnuggets.com/images/tkb-2102-g.png)](https://www.kdnuggets.com/2021/03/know-your-data-much-faster-sweetviz-python-library.html) [![KDNuggets](https://www.kdnuggets.com/images/tkb-2103-g.png)](https://www.kdnuggets.com/2021/03/know-your-data-much-faster-sweetviz-python-library.html)
 
 # Features
 - **Target analysis** 
@@ -39,6 +41,7 @@ Usage and parameters are described below, [you can also find an article describi
     - min/max/range, quartiles, mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
 
 ## New & notable
+- Version 2.2: Big compatibility update for python 3.7+ and numpy versions
 - Version 2.1: **Comet.ml** support
 - Version 2.0: **Jupyter, Colab & other notebook** support, report **scaling & vertical layout**  
 
@@ -254,11 +257,17 @@ I definitely welcome the help I can get on this project, simply get in touch on
 Please note that after a hectic development period, the code itself right now needs a bit of cleanup. :)
 
 # Special thanks & related materials
+### Contributors
+**A very special thanks to everyone who has contributed on GitHub, through reports, feedback and commits!** I want to give a special shout-out to **Frank Male**, who has been of tremendous help in fixing issues and setting up the new build pipeline for 2.2.0.
+
+[![Contributors](https://contrib.rocks/image?repo=fbdesignpro/sweetviz)](https://github.com/fbdesignpro/sweetviz/graphs/contributors)
+
+Made with [contrib.rocks](https://contrib.rocks).
+### Related materials
 I want Sweetviz to be a hub of the best of what's out there, a way to get the most valuable information and visualization, without reinventing the wheel.
 
 As such, I want to point out some of those great resources that were inspiring and integrated into Sweetviz:
 - [Pandas-Profiling](https://github.com/pandas-profiling/pandas-profiling) was the original inspiration for this project. Some of its type-detection code was included in Sweetviz.
 - [Shaked Zychlinski: The Search for Categorical Correlation](https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9) is a great article about different types of variable interactions that was the basis of that analysis in Sweetviz.
 - [Drazen Zaric: Better Heatmaps and Correlation Matrix Plots in Python](https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec) was the basis for our association graphs.
 
-**And of course, very special thanks to everyone who have contributed on Github, through reports, feedback and commits!**
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,74 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+include = ["sweetviz"]
+
+[project]
+name = "sweetviz"
+version = "2.2.1"
+authors = [
+  { name = "Francois Bertrand", email = "[email protected]" },
+]
+description = "A pandas-based library to visualize and compare datasets."
+license = { file = "LICENSE" }
+readme = "README.md"
+requires-python = ">=3.7"
+
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Development Status :: 5 - Production/Stable",
+    "Topic :: Scientific/Engineering :: Visualization",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+keywords=[
+    "pandas",
+    "data-science",
+    "data-analysis",
+    "python",
+    "eda",
+]
+dependencies = [
+    'pandas>=0.25.3,!=1.0.0,!=1.0.1,!=1.0.2',
+    'numpy>=1.16.0',
+    'matplotlib>=3.1.3',
+    'tqdm>=4.43.0',
+    'scipy>=1.3.2',
+    'jinja2>=2.11.1',
+    'importlib_resources>=1.2.0',
+    'importlib_metadata;python_version<"3.8"',
+]
+
+[project.optional-dependencies]
+test = [
+  "pytest >=6",
+  "pytest-cov >=3",
+]
+dev = [
+  "pytest >=6",
+  "pytest-cov >=3",
+]
+docs = [
+  "sphinx>=4.0",
+  "myst_parser>=0.13",
+  "sphinx_book_theme>=0.1.0",
+  "sphinx_copybutton",
+  "sphinx_autodoc_typehints",
+  "furo",
+]
+
+[project.urls]
+Homepage = "https://github.com/fbdesignpro/sweetviz"
+"Bug Tracker" = "https://github.com/fbdesignpro/sweetviz/issues"
+Discussions = "https://github.com/fbdesignpro/sweetviz/discussions"
+Changelog = "https://github.com/fbdesignpro/sweetviz/releases"
diff --git a/setup.py b/setup.py
diff --git a/sweetviz/__init__.py b/sweetviz/__init__.py
@@ -1,9 +1,15 @@
 # sweetviz public interface
 # -----------------------------------------------------------------------------------
-__title__ = 'sweetviz'
-__version__ = "2.1.4"
-__author__ = "Francois Bertrand"
-__license__ = 'MIT'
+try:
+    from importlib.metadata import metadata # Python 3.8+
+except ImportError:
+    from importlib_metadata import metadata # Python 3.7
+
+_metadata = metadata("sweetviz")
+__title__ = _metadata["name"]
+__version__ = _metadata["version"]
+__author__ = _metadata["Author-email"]
+__license__ = "MIT"
 
 # These are the main API functions
 from sweetviz.sv_public import analyze, compare, compare_intra

diff --git a/sweetviz/graph_numeric.py b/sweetviz/graph_numeric.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick
+import warnings
 
 from sweetviz.config import config
 from sweetviz import sv_html_formatters
@@ -67,10 +68,10 @@ def __init__(self, which_graph: str, to_process: FeatureToProcess):
 
         gap_percent = config["Graphs"].getfloat("summary_graph_categorical_gap")
 
-        np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
+        warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
         self.hist_specs = axs.hist(plot_data, weights = normalizing_weights, bins=self.num_bins, \
                                    rwidth = (100.0 - gap_percent) / 100.0)
-        np.warnings.filterwarnings('once', category=np.VisibleDeprecationWarning)
+        warnings.filterwarnings('once', category=np.VisibleDeprecationWarning)
 
         bin_limits = self.hist_specs[1]
         num_bins = len(bin_limits) - 1

diff --git a/sweetviz/series_analyzer.py b/sweetviz/series_analyzer.py
@@ -5,6 +5,7 @@
 import sweetviz.series_analyzer_cat
 import sweetviz.series_analyzer_text
 
+from distutils.version import LooseVersion
 
 def get_counts(series: pd.Series) -> dict:
     # The value_counts() function is used to get a Series containing counts of unique values.
@@ -17,7 +18,12 @@ def get_counts(series: pd.Series) -> dict:
         else:
             value_counts_without_nan = value_counts_with_nan
     else:
-        value_counts_without_nan = (value_counts_with_nan.reset_index().dropna().set_index("index").iloc[:, 0])
+        reset_value_counts = value_counts_with_nan.reset_index()
+        # Force consistent column naming after reset_index() on value_counts() across pandas 1.x and 2.x: make sure column 0 is "index" and column 1 is series.name
+        # -> This is a no-op in Pandas 1.x
+        reset_value_counts.rename(columns={reset_value_counts.columns[0]: "index", reset_value_counts.columns[1]:series.name}, inplace=True)
+
+        value_counts_without_nan = (reset_value_counts.dropna().set_index("index").iloc[:, 0])
     # print(value_counts_without_nan.index.dtype.name)
 
     # IGNORING NAN FOR NOW AS IT CAUSES ISSUES [FIX]

diff --git a/sweetviz/series_analyzer_numeric.py b/sweetviz/series_analyzer_numeric.py
@@ -22,7 +22,8 @@ def do_stats_numeric(series: pd.Series, updated_dict: dict):
     stats["kurtosis"] = series.kurt()
     stats["skewness"] = series.skew()
     stats["sum"] = series.sum()
-    stats["mad"] = series.mad()
+    # MAD was unused!!!
+    # stats["mad"] = (series - series.mean()).abs().mean() # deprecated: series.mad()
     stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
     return updated_dict
 

diff --git a/sweetviz/sv_math.py b/sweetviz/sv_math.py
@@ -4,7 +4,7 @@
 
 def count_fraction_of_true(series: pd.Series):
     # We are assuming this is called by a Boolean series
-    if series.dtype != np.bool and series.dtype != "Int64":
+    if series.dtype != bool and series.dtype != "Int64":
         raise ValueError
     num_true = series.sum()
     total = float(series.count())

diff --git a/sweetviz/sv_public.py b/sweetviz/sv_public.py
@@ -39,6 +39,10 @@ def compare_intra(source_df: pd.DataFrame,
 
     data_true = source_df[condition_series]
     data_false = source_df[condition_series == False]
+    if len(data_false) == 0:
+        raise ValueError('compare_intra(): FALSE dataset is empty, nothing to compare!')
+    if len(data_true) == 0:
+        raise ValueError('compare_intra(): TRUE dataset is empty, nothing to compare!')
     report = sweetviz.DataframeReport([data_true, names[0]], target_feat,
                                       [data_false, names[1]],
                                       pairwise_analysis, feat_cfg)