From 4e8f1d6213042a9184c0e203511cc452fbb49c5b Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Apr 2024 12:01:14 +0200 Subject: [PATCH] Tabmat v4 (#286) * Minimal implementation (tests green) * Remove sum method and rely on np.sum * Force DenseMatrix to always be 2-dimensional * Add __repr__ and __str__ methods * Fix as_mx * Fix ufunc return value * Wrap SparseMatrix, too * Demo of how the ufunc interface can be implemented * Do not subclass csc_matrix * Demonstrate binary ufuncs for sparse * Add tocsc method * Fix type checks * Minor improvements * ufunc support for categoricals * Remove __array_ufunc__ interface * Remove numpy operator mixin * Add hstack function * Add method for unpacking underlying array * Add __matmul__ methods to SparseMatrix * Stricter and more consistent indexing * Be consistent when instantiating from 1d arrays * Add column name metadata to `tabmat` matrices (#278) * Add column name getters * Matrix names are also combined * Add names to constructors * Add indexing support for column names * Remove unnecessary code * Better default column names * Reduce code duplication * Saner defaults * Add convenient getters and setters * Fix indexing * Smarter setter for categorical matrices * Add tests * Fix subsetting with np.newaxis * Remove the walrus :( * Fix test * Fix indexing with np.ix_ * Propagate column names where it makes sense * Fix merge mistake * Add changelog entry * Matrices from formulas (#267) * Add an experimental tabmat materializer class * Nicer way of handling interactions * Have proper column names [skip ci] * Make dummy ordering consistent with pandas [skip ci] * Fix mistake in categorical interactions [skip ci] * Add formulaic to environment files Have not added to the conda recipe yet. Should probably be optional. * Add from_formula constructor * Add some tests * Add more tests * Major refactoring - simplify categorical interactions - NaNs in categoricals should be handled correctly - parity with formulaic in categorical names * Make name formatting custommizable - interaction_separator - categorical_format - intercept_name * Add formulaic to conda recipe * Implement `C()` function to convert to categoricals * Auto-convert strings to categories * Fix C() not working from materializer interface * Add the pandasmaterializer tests from formulaic * Add formulaic to setup.py deps * Implement suggestions from code review * Clean up code - Add docstrings - Add type hints - Rename some classes * Pin formulaic minimum version * Add support for architectures not supported by xsimd (#262) * Release 3.1.9 (#263) * Pre-commit autoupdate (#264) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> * Add params for density and cardinality thresholds * Skip python 3.6 build * Refactor to avoid circular imports * Interaction of dropped and NA is dropped * Add type hint for context * Add unit tests for interactable vectors * Add more checks * Change argument name * Make C() stateful (remember levels) * Add test for categorizer state * More correct handling of encoding categoricals * Make adding an intercept implicitly parametrizable Default is False * Add na_action parameter to constrictor * Add test for sparse numerical columns * Add option to not add the constant column * Pre-commit autoupdate (#274) * Pre-commit autoupdate (#276) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> * Bump pypa/gh-action-pypi-publish from 1.8.6 to 1.8.7 (#277) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.6 to 1.8.7. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.6...v1.8.7) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump pypa/gh-action-pypi-publish from 1.8.7 to 1.8.8 (#279) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.7 to 1.8.8. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.7...v1.8.8) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Bump pypa/cibuildwheel from 2.13.1 to 2.14.1 (#280) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.13.1 to 2.14.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.13.1...v2.14.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Minimal implementation (tests green) * Remove sum method and rely on np.sum * Force DenseMatrix to always be 2-dimensional * Add __repr__ and __str__ methods * Fix as_mx * Fix ufunc return value * Wrap SparseMatrix, too * Demo of how the ufunc interface can be implemented * Do not subclass csc_matrix * Improve the performance of `from_pandas` in the case of low-cardinality categoricals (#275) * Improve the performance of `from_pandas` * Update changelog according to review * Add benchmark data to .gitignore (#282) * Demonstrate binary ufuncs for sparse * Add tocsc method * Fix type checks * Minor improvements * ufunc support for categoricals * Remove __array_ufunc__ interface * Remove numpy operator mixin * Add hstack function * Add method for unpacking underlying array * Add __matmul__ methods to SparseMatrix * Stricter and more consistent indexing * Be consistent when instantiating from 1d arrays * Adjust tests to work with v4 * Fix type hints * Add changelog entry * term and column names for formula-based matrices * Fix handling of formula-based names * Add tests for formula-based names --------- Signed-off-by: dependabot[bot] Co-authored-by: Martin Stancsics Co-authored-by: Uwe L. Korn Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Apply Matthias' suggestions Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> * Allow missing values in `CategoricalMatrix` (#281) * Add missing support to categoricals * Rename functions * Parametrize missing behavior in constructors * Return a maskedarray from recover_orig * Propagate missing_method when indexing * Add tests * Template all the things! * Privatize has_missing attribute * Add changelog entry * Add option to treat missing values as a category * Update changelog * Raise if the missing category already exists * Add tests for missing name and raise on existing * Don't skip tests (they are fast) * Apply suggestions from review * Fix indxing * Fix intercept name in formulas * Add missing cateegorical functinoality to formulas * Much cooler handlong of missing categoricals * Add changelog entry * Correctly create missing category from model_spec (#297) * pyupgrade 3.9 * make ruff and mypy happy * bump minimum formulaic version (stateful transforms) * add test case with custom cat format * pin formulaic minimum version to 0.6 (#340) * cosmetics * Raise for unseen categories when materializing from an existing `ModelSpec` (#341) * Raise error on unseen levels when materializing * Fix test for unseen categories * Add test for raising on unseen categories * Properly handle missings when checking for unseen * Expand test for unseen missings * Improve attribute name * Add comment about dropping missings in tests for new levels * consistent tense * typo * slightly improve wording * Describe breaking change * improve wording * review comments * add change from #356 * fix * set default context to None * add scope to other test, too * tiny docstring cosmetics * remove duplicate . [skip-ci] * more docstring formatting --------- Signed-off-by: dependabot[bot] Co-authored-by: Matthias Schmidtblaicher <42544829+MatthiasSchmidtblaicherQC@users.noreply.github.com> Co-authored-by: Uwe L. Korn Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Marc-Antoine Schmidt Co-authored-by: Matthias Schmidtblaicher Co-authored-by: Martin Stancsics --- CHANGELOG.rst | 20 +- conda.recipe/meta.yaml | 1 + environment-win.yml | 1 + environment.yml | 1 + setup.py | 2 +- src/tabmat/__init__.py | 7 +- src/tabmat/categorical_matrix.py | 342 +++++-- src/tabmat/constructor.py | 180 +++- src/tabmat/constructor_util.py | 49 + src/tabmat/dense_matrix.py | 208 +++- src/tabmat/ext/cat_split_helpers-tmpl.cpp | 105 +- src/tabmat/ext/categorical.pyx | 70 +- src/tabmat/ext/split.pyx | 50 +- src/tabmat/formula.py | 783 ++++++++++++++ src/tabmat/matrix_base.py | 70 ++ src/tabmat/sparse_matrix.py | 276 ++++- src/tabmat/split_matrix.py | 111 +- src/tabmat/standardized_mat.py | 73 +- src/tabmat/util.py | 48 + tests/test_categorical_matrix.py | 188 +++- tests/test_formula.py | 1128 +++++++++++++++++++++ tests/test_matrices.py | 284 +++++- tests/test_split_matrix.py | 105 +- 23 files changed, 3781 insertions(+), 321 deletions(-) create mode 100644 src/tabmat/constructor_util.py create mode 100644 src/tabmat/formula.py create mode 100644 tests/test_formula.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 40843646..5ecba596 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,17 +7,29 @@ Changelog ========= -Unreleased ----------- +4.0.0 - 2024-04-23 +------------------ + +**Breaking changes**: + +- To unify the API, :class:`DenseMatrix` does not inherit from :class:`np.ndarray` anymore. To convert a :class:`DenseMatrix` to a :class:`np.ndarray`, use :meth:`DenseMatrix.unpack`. +- Similarly, :class:`SparseMatrix` does not inherit from :class:`sps.csc_matrix` anymore. To convert a :class:`SparseMatrix` to a :class:`sps.csc_matrix`, use :meth:`SparseMatrix.unpack`. + +**New features:** + +- Added column name and term name metadata to :class:`MatrixBase` objects. These are automatically populated when initializing a :class:`MatrixBase` from a :class:`pandas.DataFrame`. In addition, they can be accessed and modified via the :attr:`MatrixBase.column_names` and :attr:`MatrixBase.term_names` properties. +- Added a formula interface for creating tabmat matrices from pandas data frames. See :func:`tabmat.from_formula` for details. +- Added support for missing values in :class:`CategoricalMatrix` by either creating a separate category for them or treating them as all-zero rows. +- Added support for handling missing categorical values in pandas data frames. **Bug fix:** -- Added cython compiler directive legacy_implicit_noexcept = True to fix performance regression with cython 3. +- Added cython compiler directive ``legacy_implicit_noexcept = True`` to fix performance regression with cython 3. **Other changes:** - Refactored the pre-commit hooks to use ruff. -- Refactored CategoricalMatrix's transpose_matvec to be deterministic when using OpenMP. +- Refactored :meth:`CategoricalMatrix.transpose_matvec` to be deterministic when using OpenMP. - Adjusted transformation to sparse format in :func:`tabmat.from_pandas` to future changes in pandas. 3.1.13 - 2023-10-17 diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 8a3d8a1c..7f5d81c9 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -38,6 +38,7 @@ requirements: - {{ pin_compatible('numpy') }} - pandas - scipy + - formulaic>=0.6 test: requires: diff --git a/environment-win.yml b/environment-win.yml index e60e4151..dd88ce10 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -5,6 +5,7 @@ channels: dependencies: - libblas>=0=*mkl - pandas + - formulaic>=0.6 # development tools - click diff --git a/environment.yml b/environment.yml index 8ac5c887..10d7d402 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: - nodefaults dependencies: - pandas + - formulaic>=0.6 # development tools - click diff --git a/setup.py b/setup.py index d58ac41e..6baffa5b 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "scipy"], + install_requires=["numpy", "pandas", "scipy", "formulaic>=0.6"], python_requires=">=3.9", ext_modules=cythonize( ext_modules, diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index db6d7dcc..bd7ce292 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -1,11 +1,11 @@ import importlib.metadata from .categorical_matrix import CategoricalMatrix -from .constructor import from_csc, from_pandas +from .constructor import from_csc, from_formula, from_pandas from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix -from .split_matrix import SplitMatrix +from .split_matrix import SplitMatrix, as_tabmat, hstack from .standardized_mat import StandardizedMatrix try: @@ -21,5 +21,8 @@ "SplitMatrix", "CategoricalMatrix", "from_csc", + "from_formula", "from_pandas", + "as_tabmat", + "hstack", ] diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 1d95c731..e2a700ac 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -162,26 +162,28 @@ def matvec(mat, vec): """ -from typing import Any, Optional, Union +import re +from typing import Optional, Union import numpy as np import pandas as pd from scipy import sparse as sps from .ext.categorical import ( - matvec, - matvec_drop_first, - multiply_drop_first, - sandwich_categorical, - sandwich_categorical_drop_first, - subset_categorical_drop_first, - transpose_matvec, - transpose_matvec_drop_first, + matvec_complex, + matvec_fast, + multiply_complex, + sandwich_categorical_complex, + sandwich_categorical_fast, + subset_categorical_complex, + transpose_matvec_complex, + transpose_matvec_fast, ) from .ext.split import sandwich_cat_cat, sandwich_cat_dense from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -190,21 +192,15 @@ def matvec(mat, vec): ) -def _is_indexer_full_length(full_length: int, indexer: Any): - if isinstance(indexer, int): - return full_length == 1 - elif isinstance(indexer, list): - if (np.asarray(indexer) > full_length - 1).any(): - raise IndexError("Index out-of-range.") - return len(set(indexer)) == full_length - elif isinstance(indexer, np.ndarray): +def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): + if isinstance(indexer, np.ndarray): if (indexer > full_length - 1).any(): raise IndexError("Index out-of-range.") - return len(np.unique(indexer)) == full_length + # Order is important in indexing. Could achieve similar results + # by rearranging categories. + return np.array_equal(indexer.ravel(), np.arange(full_length)) elif isinstance(indexer, slice): return len(range(*indexer.indices(full_length))) == full_length - else: - raise ValueError(f"Indexing with {type(indexer)} is not allowed.") def _row_col_indexing( @@ -242,6 +238,17 @@ class CategoricalMatrix(MatrixBase): drop the first level of the dummy encoding. This allows a CategoricalMatrix to be used in an unregularized setting. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the ``cat_missing_name`` + category. + + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. If this category already exists, an error + will be raised. + dtype: data type """ @@ -251,28 +258,84 @@ def __init__( cat_vec: Union[list, np.ndarray, pd.Categorical], drop_first: bool = False, dtype: np.dtype = np.float64, + column_name: Optional[str] = None, + term_name: Optional[str] = None, + column_name_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ): - if pd.isnull(cat_vec).any(): - raise ValueError("Categorical data can't have missing values.") + if cat_missing_method not in ["fail", "zero", "convert"]: + raise ValueError( + "cat_missing_method must be one of 'fail' 'zero' or 'convert', " + f" got {cat_missing_method}" + ) + self._missing_method = cat_missing_method + self._missing_category = cat_missing_name if isinstance(cat_vec, pd.Categorical): self.cat = cat_vec else: self.cat = pd.Categorical(cat_vec) + if pd.isnull(self.cat).any(): + if self._missing_method == "fail": + raise ValueError( + "Categorical data can't have missing values " + "if cat_missing_method='fail'." + ) + + elif self._missing_method == "convert": + if self._missing_category in self.cat.categories: + raise ValueError( + f"Missing category {self._missing_category} already exists." + ) + + self.cat = self.cat.add_categories([self._missing_category]) + + self.cat[pd.isnull(self.cat)] = self._missing_category + self._has_missings = False + + else: + self._has_missings = True + + else: + self._has_missings = False + self.drop_first = drop_first self.shape = (len(self.cat), len(self.cat.categories) - int(drop_first)) self.indices = self.cat.codes.astype(np.int32) self.x_csc: Optional[tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) + self._colname = column_name + if term_name is None: + self._term = self._colname + else: + self._term = term_name + self._colname_format = column_name_format + + __array_ufunc__ = None + def recover_orig(self) -> np.ndarray: """ Return 1d numpy array with same data as what was initially fed to __init__. Test: matrix/test_categorical_matrix::test_recover_orig """ - return self.cat.categories[self.cat.codes] + orig = self.cat.categories[self.cat.codes].to_numpy() + + if self._has_missings: + orig = orig.view(np.ma.MaskedArray) + orig.mask = self.cat.codes == -1 + elif ( + self._missing_method == "convert" + and self._missing_category in self.cat.categories + ): + orig = orig.view(np.ma.MaskedArray) + missing_code = self.cat.categories.get_loc(self._missing_category) + orig.mask = self.cat.codes == missing_code + + return orig def _matvec_setup( self, @@ -328,12 +391,18 @@ def matvec( if out is None: out = np.zeros(self.shape[0], dtype=other_m.dtype) - if self.drop_first: - matvec_drop_first( - self.indices, other_m, self.shape[0], cols, self.shape[1], out + if self.drop_first or self._has_missings: + matvec_complex( + self.indices, + other_m, + self.shape[0], + cols, + self.shape[1], + out, + self.drop_first, ) else: - matvec(self.indices, other_m, self.shape[0], cols, self.shape[1], out) + matvec_fast(self.indices, other_m, self.shape[0], cols, self.shape[1], out) if is_int: return out.astype(int) @@ -395,12 +464,19 @@ def transpose_matvec( if cols is not None: cols = set_up_rows_or_cols(cols, self.shape[1]) - if self.drop_first: - transpose_matvec_drop_first( - self.indices, vec, self.shape[1], vec.dtype, rows, cols, out + if self.drop_first or self._has_missings: + transpose_matvec_complex( + self.indices, + vec, + self.shape[1], + vec.dtype, + rows, + cols, + out, + self.drop_first, ) else: - transpose_matvec( + transpose_matvec_fast( self.indices, vec, self.shape[1], vec.dtype, rows, cols, out ) @@ -431,12 +507,12 @@ def sandwich( """ d = np.asarray(d) rows = set_up_rows_or_cols(rows, self.shape[0]) - if self.drop_first: - res_diag = sandwich_categorical_drop_first( - self.indices, d, rows, d.dtype, self.shape[1] + if self.drop_first or self._has_missings: + res_diag = sandwich_categorical_complex( + self.indices, d, rows, d.dtype, self.shape[1], self.drop_first ) else: - res_diag = sandwich_categorical( + res_diag = sandwich_categorical_fast( self.indices, d, rows, d.dtype, self.shape[1] ) @@ -453,31 +529,39 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self._cross_dense(other, d, rows, L_cols, R_cols) - if isinstance(other, sps.csc_matrix): - return self._cross_sparse(other, d, rows, L_cols, R_cols) + from .dense_matrix import DenseMatrix + + if isinstance(other, DenseMatrix): + return self._cross_dense(other._array, d, rows, L_cols, R_cols) + if isinstance(other, SparseMatrix): + return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return self._cross_categorical(other, d, rows, L_cols, R_cols) raise TypeError # TODO: best way to return this depends on the use case. See what that is # See how csr getcol works - def getcol(self, i: int) -> sps.csc_matrix: + def getcol(self, i: int) -> SparseMatrix: """Return matrix column at specified index.""" i %= self.shape[1] # wrap-around indexing if self.drop_first: - i += 1 + i_corr = i + 1 + else: + i_corr = i - col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None]) - return col_i + col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None]) + return SparseMatrix( + col_i, + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" - if self.drop_first: - nnz, indices, indptr = subset_categorical_drop_first( - self.indices, self.shape[1] + if self.drop_first or self._has_missings: + nnz, indices, indptr = subset_categorical_complex( + self.indices, self.shape[1], self.drop_first ) return sps.csr_matrix( (np.ones(nnz, dtype=int), indices, indptr), shape=self.shape @@ -490,10 +574,24 @@ def tocsr(self) -> sps.csr_matrix: shape=self.shape, ) + def to_sparse_matrix(self): + """Return a tabmat.SparseMatrix representation.""" + from .sparse_matrix import SparseMatrix + + return SparseMatrix( + self.tocsr(), + column_names=self.column_names, + term_names=self.term_names, + ) + def toarray(self) -> np.ndarray: """Return array representation of matrix.""" return self.tocsr().A + def unpack(self): + """Return the underlying pandas.Categorical.""" + return self.cat + def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return CategoricalMatrix cast to new type.""" self.dtype = dtype @@ -509,25 +607,23 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra return np.sqrt(mean - col_means**2) def __getitem__(self, item): - if isinstance(item, tuple): - row, col = item - if _is_indexer_full_length(self.shape[1], col): - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) - else: - # return a SparseMatrix if we subset columns - # TODO: this is inefficient. See issue #101. - return SparseMatrix(self.tocsr()[row, col], dtype=self.dtype) + row, col = _check_indexer(item) + + if _is_indexer_full_length(self.shape[1], col): + if isinstance(row, np.ndarray): + row = row.ravel() + return CategoricalMatrix( + self.cat[row], + drop_first=self.drop_first, + dtype=self.dtype, + column_name=self._colname, + column_name_format=self._colname_format, + cat_missing_method=self._missing_method, + ) else: - row = item - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) + # return a SparseMatrix if we subset columns + # TODO: this is inefficient. See issue #101. + return self.to_sparse_matrix()[row, col] def _cross_dense( self, @@ -550,15 +646,17 @@ def _cross_dense( res = sandwich_cat_dense( self.indices, - self.shape[1] + self.drop_first, + self.shape[1], d, other, rows, R_cols, is_c_contiguous, + has_missings=self._has_missings, + drop_first=self.drop_first, ) - res = _row_col_indexing(res[self.drop_first :], L_cols, None) + res = _row_col_indexing(res, L_cols, None) return res def _cross_categorical( @@ -586,6 +684,8 @@ def _cross_categorical( d.dtype, self.drop_first, other.drop_first, + self._has_missings, + other._has_missings, ) res = _row_col_indexing(res, L_cols, R_cols) @@ -617,14 +717,15 @@ def multiply(self, other) -> SparseMatrix: f"{len(other)}." ) - if self.drop_first: + if self.drop_first or self._has_missings: return SparseMatrix( sps.csr_matrix( - multiply_drop_first( + multiply_complex( indices=self.indices, d=np.squeeze(other), ncols=self.shape[1], dtype=other.dtype, + drop_first=self.drop_first, ), shape=self.shape, ) @@ -638,8 +739,111 @@ def multiply(self, other) -> SparseMatrix: np.arange(self.shape[0] + 1, dtype=int), ), shape=self.shape, - ) + ), + column_names=self.column_names, + term_names=self.term_names, ) def __repr__(self): return str(self.cat) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + if type == "column": + name = self._colname + elif type == "term": + name = self._term + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self.cat.categories) - self.drop_first)) + if name is None and missing_prefix is None: + return [None] * (len(self.cat.categories) - self.drop_first) + elif name is None: + name = f"{missing_prefix}{indices[0]}-{indices[-1]}" + + if type == "column": + return [ + self._colname_format.format(name=name, category=cat) + for cat in self.cat.categories[self.drop_first :] + ] + else: + return [name] * (len(self.cat.categories) - self.drop_first) + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != 1: + if type == "column": + # Try finding the column name + base_names = [] + for name, cat in zip(names, self.cat.categories[self.drop_first :]): + partial_name = self._colname_format.format( + name="__CAPTURE__", category=cat + ) + pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)") + if name is not None: + match = re.search(pattern, name) + else: + match = None + if match is not None: + base_names.append(match.group(1)) + else: + base_names.append(name) + names = base_names + + if len(names) == self.shape[1] and all(name == names[0] for name in names): + names = [names[0]] + + if len(names) != 1: + raise ValueError("A categorical matrix has only one name") + + if type == "column": + self._colname = names[0] + elif type == "term": + self._term = names[0] + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index a6b40efb..ae7fff5e 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,13 +1,21 @@ +import sys import warnings -from typing import Union +from collections.abc import Mapping +from typing import Any, Optional, Union import numpy as np import pandas as pd +from formulaic import Formula, ModelSpec +from formulaic.materializers.types import NAAction +from formulaic.parser import DefaultFormulaParser +from formulaic.utils.layered_mapping import LayeredMapping from pandas.api.types import is_bool_dtype, is_numeric_dtype from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts from .dense_matrix import DenseMatrix +from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -21,6 +29,9 @@ def from_pandas( object_as_cat: bool = False, cat_position: str = "expand", drop_first: bool = False, + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this @@ -50,6 +61,14 @@ def from_pandas( If true, categoricals variables will have their first category dropped. This allows multiple categorical variables to be included in an unregularized model. If False, all categories are included. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. Returns ------- @@ -72,7 +91,16 @@ def from_pandas( if object_as_cat and coldata.dtype == object: coldata = coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): - cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) + cat = CategoricalMatrix( + coldata, + drop_first=drop_first, + dtype=dtype, + column_name=colname, + term_name=colname, + column_name_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) if len(coldata.cat.categories) < cat_threshold: ( X_dense_F, @@ -82,6 +110,8 @@ def from_pandas( ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, + column_names=cat.get_names("column"), + term_names=cat.get_names("term"), ) matrices.append(X_dense_F) is_cat.append(True) @@ -129,13 +159,26 @@ def from_pandas( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) if len(dense_dfidx) > 0: - matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype))) + matrices.append( + DenseMatrix( + df.iloc[:, dense_dfidx].astype(dtype), + column_names=df.columns[dense_dfidx], + term_names=df.columns[dense_dfidx], + ) + ) indices.append(dense_mxidx) is_cat.append(False) if len(sparse_dfcols) > 0: sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)} full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo() - matrices.append(SparseMatrix(full_sparse, dtype=dtype)) + matrices.append( + SparseMatrix( + full_sparse, + dtype=dtype, + column_names=[col.name for col in sparse_dfcols], + term_names=[col.name for col in sparse_dfcols], + ) + ) indices.append(sparse_mxidx) is_cat.append(False) @@ -157,32 +200,7 @@ def from_pandas( return matrices[0] -def _split_sparse_and_dense_parts( - arg1: sps.csc_matrix, threshold: float = 0.1 -) -> tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: - """ - Split matrix. - - Return the dense and sparse parts of a matrix and the corresponding indices - for each at the provided threshold. - """ - if not isinstance(arg1, sps.csc_matrix): - raise TypeError( - f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," - f"not {type(arg1)}" - ) - if not 0 <= threshold <= 1: - raise ValueError("Threshold must be between 0 and 1.") - densities = np.diff(arg1.indptr) / arg1.shape[0] - dense_indices = np.where(densities > threshold)[0] - sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) - - X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray())) - X_sparse = SparseMatrix(arg1[:, sparse_indices]) - return X_dense_F, X_sparse, dense_indices, sparse_indices - - -def from_csc(mat: sps.csc_matrix, threshold=0.1): +def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. @@ -191,3 +209,105 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1): """ dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold) return SplitMatrix([dense, sparse], [dense_idx, sparse_idx]) + + +def from_formula( + formula: Union[str, Formula], + data: pd.DataFrame, + ensure_full_rank: bool = False, + na_action: Union[str, NAAction] = NAAction.IGNORE, + dtype: np.dtype = np.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + interaction_separator: str = ":", + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", + intercept_name: str = "Intercept", + include_intercept: bool = False, + add_column_for_intercept: bool = True, + context: Optional[Union[int, Mapping[str, Any]]] = None, +) -> SplitMatrix: + """ + Transform a pandas data frame to a SplitMatrix using a Wilkinson formula. + + Parameters + ---------- + formula: str + A formula accepted by formulaic. + data: pd.DataFrame + pandas data frame to be converted. + ensure_full_rank: bool, default False + If True, ensure that the matrix has full structural rank by categories. + na_action: Union[str, NAAction], default NAAction.IGNORE + How to handle missing values. Can be one of "drop", "ignore", "raise". + dtype: np.dtype, default np.float64 + The dtype of the resulting matrix. + sparse_threshold: float, default 0.1 + The density below which a column is treated as sparse. + cat_threshold: int, default 4 + The number of categories below which a categorical column is one-hot + encoded. This is only checked after interactions have been applied. + interaction_separator: str, default ":" + The separator between the names of interacted variables. + categorical_format: str, default "{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. + intercept_name: str, default "Intercept" + The name of the intercept column. + include_intercept: bool, default False + Whether to include an intercept term if the formula does not + include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. + add_column_for_intercept: bool, default = True + Whether to add a column of ones for the intercept, or just + have a term without a corresponding column. For advanced use only. + context: Optional[Union[int, Mapping[str, Any]]], default = None + The context to add to the evaluation context of the formula with, + e.g., custom transforms. If an integer, the context is taken from + the stack frame of the caller at the given depth. Otherwise, a + mapping from variable names to values is expected. By default, + no context is added. Set ``context=0`` to make the calling scope + available. + """ + if isinstance(context, int): + if hasattr(sys, "_getframe"): + frame = sys._getframe(context + 1) + context = LayeredMapping(frame.f_locals, frame.f_globals) + else: + context = None + spec = ModelSpec( + formula=Formula( + formula, _parser=DefaultFormulaParser(include_intercept=include_intercept) + ), + ensure_full_rank=ensure_full_rank, + na_action=na_action, + ) + materializer = TabmatMaterializer( + data, + context=context, + interaction_separator=interaction_separator, + categorical_format=categorical_format, + intercept_name=intercept_name, + dtype=dtype, + sparse_threshold=sparse_threshold, + cat_threshold=cat_threshold, + add_column_for_intercept=add_column_for_intercept, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + result = materializer.get_model_matrix(spec) + + term_names = np.zeros(len(result.term_names), dtype="object") + for term, indices in result.model_spec.term_indices.items(): + term_names[indices] = str(term) + result.term_names = term_names.tolist() + + return result diff --git a/src/tabmat/constructor_util.py b/src/tabmat/constructor_util.py new file mode 100644 index 00000000..772baeb0 --- /dev/null +++ b/src/tabmat/constructor_util.py @@ -0,0 +1,49 @@ +from collections.abc import Sequence +from typing import Optional + +import numpy as np +import scipy.sparse as sps + +from .dense_matrix import DenseMatrix +from .sparse_matrix import SparseMatrix + + +def _split_sparse_and_dense_parts( + arg1: sps.csc_matrix, + threshold: float = 0.1, + column_names: Optional[Sequence[Optional[str]]] = None, + term_names: Optional[Sequence[Optional[str]]] = None, +) -> tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: + """ + Split matrix. + + Return the dense and sparse parts of a matrix and the corresponding indices + for each at the provided threshold. + """ + if not isinstance(arg1, sps.csc_matrix): + raise TypeError( + f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," + f"not {type(arg1)}" + ) + if not 0 <= threshold <= 1: + raise ValueError("Threshold must be between 0 and 1.") + densities = np.diff(arg1.indptr) / arg1.shape[0] + dense_indices = np.where(densities > threshold)[0] + sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) + + if column_names is None: + column_names = [None] * arg1.shape[1] + if term_names is None: + term_names = column_names + + X_dense_F = DenseMatrix( + np.asfortranarray(arg1[:, dense_indices].toarray()), + column_names=[column_names[i] for i in dense_indices], + term_names=[term_names[i] for i in dense_indices], + ) + X_sparse = SparseMatrix( + arg1[:, sparse_indices], + column_names=[column_names[i] for i in sparse_indices], + term_names=[term_names[i] for i in sparse_indices], + ) + return X_dense_F, X_sparse, dense_indices, sparse_indices diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index fe3290f1..5dc6a156 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,3 +1,4 @@ +import textwrap from typing import Optional, Union import numpy as np @@ -10,6 +11,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -17,7 +19,7 @@ ) -class DenseMatrix(np.ndarray, MatrixBase): +class DenseMatrix(MatrixBase): """ A ``numpy.ndarray`` subclass with several additional functions that allow it to share the MatrixBase API with SparseMatrix and CategoricalMatrix. @@ -32,29 +34,106 @@ class DenseMatrix(np.ndarray, MatrixBase): """ - def __new__(cls, input_array): # noqa - """ - Details of how to subclass np.ndarray are explained here: + def __init__(self, input_array, column_names=None, term_names=None): + input_array = np.asarray(input_array) - https://docs.scipy.org/doc/numpy/user/basics.subclassing.html\ - #slightly-more-realistic-example-attribute-added-to-existing-array - """ - obj = np.asarray(input_array).view(cls) - if not np.issubdtype(obj.dtype, np.floating): - raise NotImplementedError("DenseMatrix is only implemented for float data") - return obj + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + + self._array = np.asarray(input_array) + width = self._array.shape[1] + + if column_names is not None: + if len(column_names) != width: + raise ValueError( + f"Expected {width} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * width + + if term_names is not None: + if len(term_names) != width: + raise ValueError(f"Expected {width} term names, got {len(term_names)}") + self._terms = term_names + else: + self._terms = self._colnames + + def __getitem__(self, key): + row, col = _check_indexer(key) + colnames = list(np.array(self.column_names)[col].ravel()) + terms = list(np.array(self.term_names)[col].ravel()) + + return type(self)( + self._array.__getitem__((row, col)), column_names=colnames, term_names=terms + ) + + __array_ufunc__ = None + + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + + def __str__(self): + return "{}x{} DenseMatrix:\n\n".format(*self.shape) + np.array_str(self._array) + + def __repr__(self): + class_name = type(self).__name__ + array_str = f"{class_name}({np.array2string(self._array, separator=', ')})" + return textwrap.indent( + array_str, + " " * (len(class_name) + 1), + predicate=lambda line: not line.startswith(class_name), + ) + + @property + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) - def __array_finalize__(self, obj): - if obj is None: - return + T = property(transpose) + + def astype(self, dtype, order="K", casting="unsafe", copy=True): + """Copy of the array, cast to a specified type.""" + return type(self)( + self._array.astype(dtype, order, casting, copy), + column_names=self.column_names, + term_names=self.term_names, + ) def getcol(self, i): """Return matrix column at specified index.""" - return self[:, [i]] + return type(self)( + self._array[:, [i]], + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def toarray(self): """Return array representation of matrix.""" - return np.asarray(self) + return self._array + + def unpack(self): + """Return the underlying numpy.ndarray.""" + return self._array def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None @@ -62,7 +141,7 @@ def sandwich( """Perform a sandwich product: X.T @ diag(d) @ X.""" d = np.asarray(d) rows, cols = setup_restrictions(self.shape, rows, cols) - return dense_sandwich(self, d, rows, cols) + return dense_sandwich(self._array, d, rows, cols) def _cross_sandwich( self, @@ -81,7 +160,7 @@ def _cross_sandwich( def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: """Get standard deviations of columns.""" - sqrt_arg = transpose_square_dot_weights(self, weights) - col_means**2 + sqrt_arg = transpose_square_dot_weights(self._array, weights) - col_means**2 # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. @@ -105,7 +184,7 @@ def _matvec_helper( # this without an explosion of code? vec = np.asarray(vec) check_matvec_dimensions(self, vec, transpose=transpose) - X = self.T if transpose else self + X = self._array.T if transpose else self._array # NOTE: We assume that rows and cols are unique unrestricted_rows = rows is None or len(rows) == self.shape[0] @@ -122,11 +201,11 @@ def _matvec_helper( # TODO: should take 'out' parameter fast_fnc = dense_rmatvec if transpose else dense_matvec if vec.ndim == 1: - res = fast_fnc(self, vec, rows, cols) + res = fast_fnc(self._array, vec, rows, cols) elif vec.ndim == 2 and vec.shape[1] == 1: - res = fast_fnc(self, vec[:, 0], rows, cols)[:, None] + res = fast_fnc(self._array, vec[:, 0], rows, cols)[:, None] else: - subset = self[np.ix_(rows, cols)] + subset = self._array[np.ix_(rows, cols)] res = subset.T.dot(vec[rows]) if transpose else subset.dot(vec[cols]) if out is None: return res @@ -164,5 +243,86 @@ def multiply(self, other): This assumes that ``other`` is a vector of size ``self.shape[0]``. """ if np.asanyarray(other).ndim == 1: - return super().__mul__(other[:, np.newaxis]) - return super().__mul__(other) + return type(self)( + self._array.__mul__(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.__mul__(other), + column_names=self.column_names, + term_names=self.term_names, + ) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self._colnames))) + + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + + return list(names) + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/ext/cat_split_helpers-tmpl.cpp b/src/tabmat/ext/cat_split_helpers-tmpl.cpp index d70960b5..df7a21c6 100644 --- a/src/tabmat/ext/cat_split_helpers-tmpl.cpp +++ b/src/tabmat/ext/cat_split_helpers-tmpl.cpp @@ -1,26 +1,29 @@ #include #include -<%def name="transpose_matvec(dropfirst)"> +<%def name="transpose_matvec(type)"> template -void _transpose_matvec_${dropfirst}( +void _transpose_matvec_${type}( Int n_rows, Int* indices, F* other, F* res, Int res_size + % if type == 'all_rows_complex': + , bool drop_first + % endif ) { int num_threads = omp_get_max_threads(); std::vector all_res(num_threads * res_size, 0.0); - #pragma omp parallel shared(all_res) + #pragma omp parallel shared(all_res) { int tid = omp_get_thread_num(); - F* res_slice = &all_res[tid * res_size]; + F* res_slice = &all_res[tid * res_size]; #pragma omp for for (Py_ssize_t i = 0; i < n_rows; i++) { - % if dropfirst == 'all_rows_drop_first': - Py_ssize_t col_idx = indices[i] - 1; - if (col_idx != -1) { + % if type == 'all_rows_complex': + Py_ssize_t col_idx = indices[i] - drop_first; + if (col_idx >= 0) { res_slice[col_idx] += other[i]; } % else: @@ -30,16 +33,17 @@ void _transpose_matvec_${dropfirst}( #pragma omp for for (Py_ssize_t i = 0; i < res_size; ++i) { for (int tid = 0; tid < num_threads; ++tid) { - res[i] += all_res[tid * res_size + i]; + res[i] += all_res[tid * res_size + i]; } - } + } } } +<%def name="sandwich_cat_cat(type)"> template -void _sandwich_cat_cat( +void _sandwich_cat_cat_${type}( F* d, const Int* i_indices, const Int* j_indices, @@ -47,9 +51,11 @@ void _sandwich_cat_cat( Int len_rows, F* res, Int res_n_col, - Int res_size, - bool i_drop_first, - bool j_drop_first + Int res_size + % if type == 'complex': + , bool i_drop_first + , bool j_drop_first + % endif ) { #pragma omp parallel @@ -58,14 +64,25 @@ void _sandwich_cat_cat( # pragma omp for for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) { Int k = rows[k_idx]; - Int i = i_indices[k] - i_drop_first; - if (i == -1) { - continue; - } - Int j = j_indices[k] - j_drop_first; - if (j == -1) { - continue; - } + + % if type == 'complex': + Int i = i_indices[k] - i_drop_first; + if (i < 0) { + continue; + } + % else: + Int i = i_indices[k]; + % endif + + % if type == 'complex': + Int j = j_indices[k] - j_drop_first; + if (j < 0) { + continue; + } + % else: + Int j = j_indices[k]; + % endif + restemp[(Py_ssize_t) i * res_n_col + j] += d[k]; } for (Py_ssize_t i = 0; i < res_size; i++) { @@ -74,11 +91,12 @@ void _sandwich_cat_cat( } } } + -<%def name="sandwich_cat_dense_tmpl(order)"> +<%def name="sandwich_cat_dense_tmpl(order, type)"> template -void _sandwich_cat_dense${order}( +void _sandwich_cat_dense${order}_${type}( F* d, const Int* indices, Int* rows, @@ -90,6 +108,9 @@ void _sandwich_cat_dense${order}( F* mat_j, Int mat_j_nrow, Int mat_j_ncol + % if type == 'complex': + , bool drop_first + % endif ) { #pragma omp parallel @@ -98,20 +119,28 @@ void _sandwich_cat_dense${order}( #pragma omp for for (Py_ssize_t k_idx = 0; k_idx < len_rows; k_idx++) { Py_ssize_t k = rows[k_idx]; - Py_ssize_t i = indices[k]; // MAYBE TODO: explore whether the column restriction slows things down a // lot, particularly if not restricting the columns allows using SIMD // instructions // MAYBE TODO: explore whether swapping the loop order for F-ordered mat_j // is useful. - for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { - Py_ssize_t j = j_cols[j_idx]; - % if order == 'C': - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; - % else: - restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; - % endif - } + % if type == 'complex': + Py_ssize_t i = indices[k] - drop_first; + if (i >= 0) { + % else: + Py_ssize_t i = indices[k]; + % endif + for (Py_ssize_t j_idx = 0; j_idx < len_j_cols; j_idx++) { + Py_ssize_t j = j_cols[j_idx]; + % if order == 'C': + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[k * mat_j_ncol + j]; + % else: + restemp[i * len_j_cols + j_idx] += d[k] * mat_j[j * mat_j_nrow + k]; + % endif + } + % if type == 'complex': + } + % endif } for (Py_ssize_t i = 0; i < res_size; i++) { #pragma omp atomic @@ -121,7 +150,11 @@ void _sandwich_cat_dense${order}( } -${sandwich_cat_dense_tmpl('C')} -${sandwich_cat_dense_tmpl('F')} -${transpose_matvec('all_rows')} -${transpose_matvec('all_rows_drop_first')} +${sandwich_cat_dense_tmpl('C', 'fast')} +${sandwich_cat_dense_tmpl('F', 'fast')} +${sandwich_cat_dense_tmpl('C', 'complex')} +${sandwich_cat_dense_tmpl('F', 'complex')} +${sandwich_cat_cat('fast')} +${sandwich_cat_cat('complex')} +${transpose_matvec('all_rows_fast')} +${transpose_matvec('all_rows_complex')} diff --git a/src/tabmat/ext/categorical.pyx b/src/tabmat/ext/categorical.pyx index 59ea308a..0c90b2c7 100644 --- a/src/tabmat/ext/categorical.pyx +++ b/src/tabmat/ext/categorical.pyx @@ -11,11 +11,11 @@ from libcpp cimport bool cdef extern from "cat_split_helpers.cpp": - void _transpose_matvec_all_rows[Int, F](Int, Int*, F*, F*, Int) - void _transpose_matvec_all_rows_drop_first[Int, F](Int, Int*, F*, F*, Int) + void _transpose_matvec_all_rows_fast[Int, F](Int, Int*, F*, F*, Int) + void _transpose_matvec_all_rows_complex[Int, F](Int, Int*, F*, F*, Int, bool) -def transpose_matvec( +def transpose_matvec_fast( int[:] indices, floating[:] other, int n_cols, @@ -34,7 +34,7 @@ def transpose_matvec( # Case 1: No row or col restrictions if no_row_restrictions and no_col_restrictions: - _transpose_matvec_all_rows(n_rows, &indices[0], &other[0], &out[0], out_size) + _transpose_matvec_all_rows_fast(n_rows, &indices[0], &other[0], &out[0], out_size) # Case 2: row restrictions but no col restrictions elif no_col_restrictions: rows_view = rows @@ -62,14 +62,15 @@ def transpose_matvec( out[col] += other[row] -def transpose_matvec_drop_first( +def transpose_matvec_complex( int[:] indices, floating[:] other, int n_cols, dtype, rows, cols, - floating[:] out + floating[:] out, + bint drop_first ): cdef int row, row_idx, n_keep_rows, col_idx cdef int n_rows = len(indices) @@ -81,15 +82,15 @@ def transpose_matvec_drop_first( # Case 1: No row or col restrictions if no_row_restrictions and no_col_restrictions: - _transpose_matvec_all_rows_drop_first(n_rows, &indices[0], &other[0], &out[0], out_size) + _transpose_matvec_all_rows_complex(n_rows, &indices[0], &other[0], &out[0], out_size, drop_first) # Case 2: row restrictions but no col restrictions elif no_col_restrictions: rows_view = rows n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if col_idx != -1: + col_idx = indices[row] - drop_first + if col_idx >= 0: out[col_idx] += other[row] # Cases 3 and 4: col restrictions else: @@ -97,8 +98,8 @@ def transpose_matvec_drop_first( # Case 3: Col restrictions but no row restrictions if no_row_restrictions: for row_idx in range(n_rows): - col_idx = indices[row_idx] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row_idx] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row_idx] # Case 4: Both col restrictions and row restrictions else: @@ -106,8 +107,8 @@ def transpose_matvec_drop_first( n_keep_rows = len(rows_view) for row_idx in range(n_keep_rows): row = rows_view[row_idx] - col_idx = indices[row] - 1 - if (col_idx != -1) and (cols_included[col_idx]): + col_idx = indices[row] - drop_first + if (col_idx >= 0) and (cols_included[col_idx]): out[col_idx] += other[row] @@ -119,7 +120,7 @@ def get_col_included(int[:] cols, int n_cols): return col_included -def matvec( +def matvec_fast( const int[:] indices, floating[:] other, int n_rows, @@ -145,13 +146,14 @@ def matvec( return -def matvec_drop_first( +def matvec_complex( const int[:] indices, floating[:] other, int n_rows, int[:] cols, int n_cols, - floating[:] out_vec + floating[:] out_vec, + bint drop_first ): """See `matvec`. Here we drop the first category of the CategoricalMatrix so the indices refer to the column index + 1. @@ -161,19 +163,19 @@ def matvec_drop_first( if cols is None: for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[i] - drop_first # reference category is always 0. + if col_idx >= 0: out_vec[i] += other[col_idx] else: col_included = get_col_included(cols, n_cols) for i in prange(n_rows, nogil=True): - col_idx = indices[i] - 1 - if (col_idx != -1) and (col_included[col_idx] == 1): + col_idx = indices[i] - drop_first + if (col_idx >= 0) and (col_included[col_idx] == 1): out_vec[i] += other[col_idx] return -def sandwich_categorical( +def sandwich_categorical_fast( const int[:] indices, floating[:] d, int[:] rows, @@ -191,12 +193,13 @@ def sandwich_categorical( return np.asarray(res) -def sandwich_categorical_drop_first( +def sandwich_categorical_complex( const int[:] indices, floating[:] d, int[:] rows, dtype, - int n_cols + int n_cols, + bint drop_first ): cdef floating[:] res = np.zeros(n_cols, dtype=dtype) cdef int col_idx, k, k_idx @@ -204,26 +207,24 @@ def sandwich_categorical_drop_first( for k_idx in range(n_rows): k = rows[k_idx] - col_idx = indices[k] - 1 # reference category is always 0. - if col_idx != -1: + col_idx = indices[k] - drop_first # reference category is always 0. + if col_idx >= 0: res[col_idx] += d[k] return np.asarray(res) -def multiply_drop_first( +def multiply_complex( int[:] indices, numeric[:] d, int ncols, dtype, + bint drop_first, ): """Multiply a CategoricalMatrix by a vector d. The output cannot be a CategoricalMatrix anymore. Here we return the inputs to transform to a csr_matrix. - Note that *_drop_first function assume the CategoricalMatrix - has its first category dropped. - Parameters ---------- indices: @@ -255,9 +256,9 @@ def multiply_drop_first( for i in range(nrows): vnew_indptr[i] = nonref_cnt - if indices[i] != 0: + if indices[i] >= drop_first: vnew_data[nonref_cnt] = d[i] - vnew_indices[nonref_cnt] = indices[i] - 1 + vnew_indices[nonref_cnt] = indices[i] - drop_first nonref_cnt += 1 vnew_indptr[i+1] = nonref_cnt @@ -265,9 +266,10 @@ def multiply_drop_first( return new_data[:nonref_cnt], new_indices[:nonref_cnt], new_indptr -def subset_categorical_drop_first( +def subset_categorical_complex( int[:] indices, int ncols, + bint drop_first ): """Construct the inputs to transform a CategoricalMatrix into a csr_matrix. @@ -299,8 +301,8 @@ def subset_categorical_drop_first( for i in range(nrows): vnew_indptr[i] = nonzero_cnt - if indices[i] != 0: - vnew_indices[nonzero_cnt] = indices[i] - 1 + if indices[i] >= drop_first: + vnew_indices[nonzero_cnt] = indices[i] - drop_first nonzero_cnt += 1 vnew_indptr[i+1] = nonzero_cnt diff --git a/src/tabmat/ext/split.pyx b/src/tabmat/ext/split.pyx index 11f5d3c0..21d2f52e 100644 --- a/src/tabmat/ext/split.pyx +++ b/src/tabmat/ext/split.pyx @@ -21,9 +21,12 @@ ctypedef fused win_integral: cdef extern from "cat_split_helpers.cpp": - void _sandwich_cat_denseC[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil - void _sandwich_cat_denseF[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil - void _sandwich_cat_cat[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int, bool, bool) + void _sandwich_cat_denseC_fast[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil + void _sandwich_cat_denseF_fast[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int) nogil + void _sandwich_cat_denseC_complex[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int, bool) nogil + void _sandwich_cat_denseF_complex[Int, F](F*, Int*, Int*, Int, Int*, Int, F*, Int, F*, Int, Int, bool) nogil + void _sandwich_cat_cat_fast[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int) + void _sandwich_cat_cat_complex[Int, F](F*, const Int*, const Int*, Int*, Int, F*, Int, Int, bool, bool) def sandwich_cat_dense( @@ -33,7 +36,9 @@ def sandwich_cat_dense( np.ndarray mat_j, int[:] rows, int[:] j_cols, - bool is_c_contiguous + bool is_c_contiguous, + bool has_missings, + bint drop_first ): cdef int nj_rows = mat_j.shape[0] cdef int nj_cols = mat_j.shape[1] @@ -53,14 +58,24 @@ def sandwich_cat_dense( cdef floating* mat_j_p = mat_j.data - if is_c_contiguous: - _sandwich_cat_denseC(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, - nj_active_cols, &res[0, 0], res_size, mat_j_p, - nj_rows, nj_cols) + if (not drop_first) and (not has_missings): + if is_c_contiguous: + _sandwich_cat_denseC_fast(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols) + else: + _sandwich_cat_denseF_fast(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols) else: - _sandwich_cat_denseF(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, - nj_active_cols, &res[0, 0], res_size, mat_j_p, - nj_rows, nj_cols) + if is_c_contiguous: + _sandwich_cat_denseC_complex(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols, drop_first) + else: + _sandwich_cat_denseF_complex(d_p, i_indices_p, rows_p, n_active_rows, j_cols_p, + nj_active_cols, &res[0, 0], res_size, mat_j_p, + nj_rows, nj_cols, drop_first) return np.asarray(res) @@ -74,7 +89,9 @@ def sandwich_cat_cat( int[:] rows, dtype, bint i_drop_first, - bint j_drop_first + bint j_drop_first, + bool i_has_missings, + bool j_has_missings ): """ (X1.T @ diag(d) @ X2)[i, j] = sum_k X1[k, i] d[k] X2[k, j] @@ -83,8 +100,13 @@ def sandwich_cat_cat( cdef int n_rows = len(rows) cdef int res_size = res.size - _sandwich_cat_cat(&d[0], &i_indices[0], &j_indices[0], &rows[0], n_rows, - &res[0, 0], j_ncol, res_size, i_drop_first, j_drop_first) + if i_drop_first or j_drop_first or i_has_missings or j_has_missings: + _sandwich_cat_cat_complex(&d[0], &i_indices[0], &j_indices[0], &rows[0], + n_rows, &res[0, 0], j_ncol, res_size, + i_drop_first, j_drop_first) + else: + _sandwich_cat_cat_fast(&d[0], &i_indices[0], &j_indices[0], &rows[0], + n_rows, &res[0, 0], j_ncol, res_size) return np.asarray(res) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py new file mode 100644 index 00000000..0afe6df1 --- /dev/null +++ b/src/tabmat/formula.py @@ -0,0 +1,783 @@ +import functools +import itertools +from abc import ABC, abstractmethod +from collections import OrderedDict +from collections.abc import Iterable +from typing import Any, Optional, Union + +import numpy +import pandas +from formulaic import ModelMatrix, ModelSpec +from formulaic.errors import FactorEncodingError +from formulaic.materializers import FormulaMaterializer +from formulaic.materializers.base import EncodedTermStructure +from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm +from formulaic.parser.types import Term +from formulaic.transforms import stateful_transform +from interface_meta import override +from scipy import sparse as sps + +from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts +from .dense_matrix import DenseMatrix +from .matrix_base import MatrixBase +from .sparse_matrix import SparseMatrix +from .split_matrix import SplitMatrix + + +class TabmatMaterializer(FormulaMaterializer): + """Materializer for pandas input and tabmat output.""" + + REGISTER_NAME = "tabmat" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = "tabmat" + + @override + def _init(self): + self.interaction_separator = self.params.get("interaction_separator", ":") + self.categorical_format = self.params.get( + "categorical_format", "{name}[{category}]" + ) + self.intercept_name = self.params.get("intercept_name", "Intercept") + self.dtype = self.params.get("dtype", numpy.float64) + self.sparse_threshold = self.params.get("sparse_threshold", 0.1) + self.cat_threshold = self.params.get("cat_threshold", 4) + self.add_column_for_intercept = self.params.get( + "add_column_for_intercept", True + ) + self.cat_missing_method = self.params.get("cat_missing_method", "fail") + self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)") + + # We can override formulaic's C() function here + self.context["C"] = _C + + @override + def _is_categorical(self, values): + if isinstance(values, (pandas.Series, pandas.Categorical)): + return values.dtype == object or isinstance( + values.dtype, pandas.CategoricalDtype + ) + return super()._is_categorical(values) + + @override + def _check_for_nulls(self, name, values, na_action, drop_rows): + if na_action is NAAction.IGNORE: + return + + if na_action is NAAction.RAISE: + if isinstance(values, pandas.Series) and values.isnull().values.any(): + raise ValueError(f"`{name}` contains null values after evaluation.") + + elif na_action is NAAction.DROP: + if isinstance(values, pandas.Series): + drop_rows.update(numpy.flatnonzero(values.isnull().values)) + + else: + raise ValueError( + f"Do not know how to interpret `na_action` = {repr(na_action)}." + ) + + @override + def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): + series = value * numpy.ones(self.nrows - len(drop_rows)) + return _InteractableDenseVector(series, name=self.intercept_name) + + @override + def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + if isinstance(values, pandas.Series): + values = values.to_numpy().astype(self.dtype) + if (values != 0).mean() <= self.sparse_threshold: + return _InteractableSparseVector(sps.csc_matrix(values[:, numpy.newaxis])) + else: + return _InteractableDenseVector(values) + + @override + def _encode_categorical( + self, values, metadata, encoder_state, spec, drop_rows, reduced_rank=False + ): + # We do not do any encoding here as it is handled by tabmat + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + return encode_contrasts( + values, + reduced_rank=reduced_rank, + missing_method=self.cat_missing_method, + missing_name=self.cat_missing_name, + _metadata=metadata, + _state=encoder_state, + _spec=spec, + ) + + @override + def _combine_columns(self, cols, spec, drop_rows): + # Special case no columns + if not cols: + values = numpy.empty((self.data.shape[0], 0), dtype=self.dtype) + return DenseMatrix(values) + + # Otherwise, concatenate columns into SplitMatrix + return SplitMatrix( + [ + col[1].to_tabmat( + self.dtype, + self.sparse_threshold, + self.cat_threshold, + ) + for col in cols + ] + ) + + # Have to override this because of column names + # (and possibly intercept later on) + @override + def _build_model_matrix(self, spec: ModelSpec, drop_rows): + # Step 0: Apply any requested column/term clustering + # This must happen before Step 1 otherwise the greedy rank reduction + # below would result in a different outcome than if the columns had + # always been in the generated order. + terms = self._cluster_terms(spec.formula, cluster_by=spec.cluster_by) + + # Step 1: Determine strategy to maintain full-rankness of output matrix + scoped_terms_for_terms = self._get_scoped_terms( + terms, + ensure_full_rank=spec.ensure_full_rank, + ) + + # Step 2: Generate the columns which will be collated + cols = [] + for term, scoped_terms in scoped_terms_for_terms: + scoped_cols = OrderedDict() + for scoped_term in scoped_terms: + if not scoped_term.factors: + if not self.add_column_for_intercept: + continue + scoped_cols[self.intercept_name] = ( + scoped_term.scale + * self._encode_constant(1, None, {}, spec, drop_rows) + ) + else: + scoped_cols.update( + self._get_columns_for_term( + [ + self._encode_evaled_factor( + scoped_factor.factor, + spec, + drop_rows, + reduced_rank=scoped_factor.reduced, + ) + for scoped_factor in scoped_term.factors + ], + spec=spec, + scale=scoped_term.scale, + ) + ) + cols.append((term, scoped_terms, scoped_cols)) + + # Step 3: Populate remaining model spec fields + if spec.structure: + cols = self._enforce_structure(cols, spec, drop_rows) + else: + # for term, scoped_terms, columns in spec.structure: + # expanded_columns = list( + # itertools.chain(colname_dict[col] for col in columns + # )) + # expanded_structure.append( + # EncodedTermStructure(term, scoped_terms, expanded_columns) + # ) + + spec = spec.update( + structure=[ + EncodedTermStructure( + term, + [st.copy(without_values=True) for st in scoped_terms], + # This is the only line that is different from the original: + list( + itertools.chain( + *(mat.get_names() for mat in scoped_cols.values()) + ) + ), + ) + for term, scoped_terms, scoped_cols in cols + ], + ) + + # Step 4: Collate factors into one ModelMatrix + return ModelMatrix( + self._combine_columns( + [ + (name, values) + for term, scoped_terms, scoped_cols in cols + for name, values in scoped_cols.items() + ], + spec=spec, + drop_rows=drop_rows, + ), + spec=spec, + ) + + @override + def _get_columns_for_term(self, factors, spec, scale=1): + """Assemble the columns for a model matrix given factors and a scale.""" + out = OrderedDict() + for reverse_product in itertools.product( + *(factor.items() for factor in reversed(factors)) + ): + product = reverse_product[::-1] + out[":".join(p[0] for p in product)] = scale * functools.reduce( + functools.partial(_interact, separator=self.interaction_separator), + ( + p[1].set_name(p[0], name_format=self.categorical_format) + for p in product + ), + ) + return out + + # Again, need a correction to handle categoricals properly + @override + def _enforce_structure( + self, + cols: list[tuple[Term, list[ScopedTerm], dict[str, Any]]], + spec, + drop_rows: set, + ): + assert len(cols) == len(spec.structure) + for i, col_spec in enumerate(cols): + scoped_cols = col_spec[2] + target_cols = spec.structure[i][2].copy() + + # Correction for categorical variables: + for name, col in scoped_cols.items(): + if isinstance(col, _InteractableCategoricalVector): + try: + _replace_sequence(target_cols, col.get_names(), name) + except ValueError: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are " + "inconsistent with the specification: generated: " + f"{col.get_names()}, expecting: {target_cols}." + ) + + if len(scoped_cols) > len(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated too many columns compared to " + f"specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + if len(scoped_cols) < len(target_cols): + if len(scoped_cols) == 0: + col = self._encode_constant(0, None, None, spec, drop_rows) + elif len(scoped_cols) == 1: + col = tuple(scoped_cols.values())[0] + else: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated insufficient columns " + "compared to specification: generated {list(scoped_cols)}, " + f"expecting {target_cols}." + ) + scoped_cols = {name: col for name in target_cols} + elif set(scoped_cols) != set(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are inconsistent " + "with specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + + yield ( + col_spec[0], + col_spec[1], + {col: scoped_cols[col] for col in target_cols}, + ) + + +class _InteractableVector(ABC): + """Abstract base class for interactable vectors, which are mostly thin + wrappers over numpy arrays, scipy sparse matrices and pandas categoricals. + """ + + name: Optional[str] + + @abstractmethod + def to_tabmat( + self, + dtype: numpy.dtype, + sparse_threshold: float, + cat_threshold: int, + ) -> MatrixBase: + """Convert to an actual tabmat matrix.""" + pass + + @abstractmethod + def get_names(self) -> list[str]: + """Return the names of the columns represented by this vector. + + Returns + ------- + List[str] + The names of the columns represented by this vector. + """ + pass + + @abstractmethod + def set_name(self, name, name_format): + """Set the name of the vector. + + Parameters + ---------- + name : str + The name to set. + name_format : str + The format string to use to format the name. Only used for + categoricals. Has to include the placeholders ``{name}`` + and ``{category}`` + + Returns + ------- + self + A reference to the vector itself. + """ + pass + + +class _InteractableDenseVector(_InteractableVector): + def __init__(self, values: numpy.ndarray, name: Optional[str] = None): + self.values = values + self.name = name + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableDenseVector( + values=self.values * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> Union[SparseMatrix, DenseMatrix]: + if (self.values != 0).mean() > sparse_threshold: + return DenseMatrix(self.values, column_names=[self.name]) + else: + # Columns can become sparser, but not denser through interactions + return SparseMatrix( + sps.csc_matrix(self.values[:, numpy.newaxis]), column_names=[self.name] + ) + + def get_names(self) -> list[str]: + if self.name is None: + raise RuntimeError("Name not set") + return [self.name] + + def set_name(self, name, name_format=None) -> "_InteractableDenseVector": + self.name = name + return self + + +class _InteractableSparseVector(_InteractableVector): + def __init__(self, values: sps.csc_matrix, name: Optional[str] = None): + self.values = values + self.name = name + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableSparseVector( + values=self.values * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> SparseMatrix: + return SparseMatrix(self.values, column_names=[self.name]) + + def get_names(self) -> list[str]: + if self.name is None: + raise RuntimeError("Name not set") + return [self.name] + + def set_name(self, name, name_format=None) -> "_InteractableSparseVector": + self.name = name + return self + + +class _InteractableCategoricalVector(_InteractableVector): + def __init__( + self, + codes: numpy.ndarray, + categories: list[str], + multipliers: numpy.ndarray, + name: Optional[str] = None, + ): + # sentinel values for codes: + # -1: missing + # -2: drop + self.codes = codes + self.categories = categories + self.multipliers = multipliers + self.name = name + + @classmethod + def from_categorical( + cls, + cat: pandas.Categorical, + reduced_rank: bool, + missing_method: str = "fail", + missing_name: str = "(MISSING)", + add_missing_category: bool = False, + ) -> "_InteractableCategoricalVector": + """Create an interactable categorical vector from a pandas categorical.""" + categories = list(cat.categories) + codes = cat.codes.copy().astype(numpy.int64) + + if reduced_rank: + codes[codes == 0] = -2 + codes[codes > 0] -= 1 + categories = categories[1:] + + if missing_method == "fail" and -1 in codes: + raise ValueError( + "Categorical data can't have missing values " + "if cat_missing_method='fail'." + ) + + if missing_method == "convert" and (-1 in codes or add_missing_category): + codes[codes == -1] = len(categories) + categories.append(missing_name) + + return cls( + codes=codes, + categories=categories, + multipliers=numpy.ones(len(cat.codes)), + ) + + def __rmul__(self, other): + if isinstance(other, (int, float)): + return _InteractableCategoricalVector( + categories=self.categories, + codes=self.codes, + multipliers=self.multipliers * other, + name=self.name, + ) + + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]: + codes = self.codes.copy() + categories = self.categories.copy() + if -2 in self.codes: + codes[codes >= 0] += 1 + codes[codes == -2] = 0 + categories.insert(0, "__drop__") + drop_first = True + else: + drop_first = False + + cat = pandas.Categorical.from_codes( + codes=codes, + categories=categories, + ordered=False, + ) + + categorical_part = CategoricalMatrix( + cat, + drop_first=drop_first, + dtype=dtype, + column_name=self.name, + column_name_format="{category}", + cat_missing_method="zero", # missing values are already handled + ) + + if (self.codes == -2).all(): + # All values are dropped + return DenseMatrix(numpy.empty((len(codes), 0), dtype=dtype)) + elif (self.multipliers == 1).all() and len(categories) >= cat_threshold: + return categorical_part + else: + sparse_matrix = sps.csc_matrix( + categorical_part.tocsr().multiply(self.multipliers[:, numpy.newaxis]) + ) + ( + dense_part, + sparse_part, + dense_idx, + sparse_idx, + ) = _split_sparse_and_dense_parts( + sparse_matrix, + sparse_threshold, + column_names=categorical_part.column_names, + ) + return SplitMatrix([dense_part, sparse_part], [dense_idx, sparse_idx]) + + def get_names(self) -> list[str]: + if self.name is None: + raise RuntimeError("Name not set") + return self.categories + + def set_name( + self, name, name_format="{name}[{category}]" + ) -> "_InteractableCategoricalVector": + if self.name is None: + # Make sure to only format the name once + self.name = name + self.categories = [ + name_format.format(name=name, category=cat) for cat in self.categories + ] + return self + + +def _interact( + left: _InteractableVector, right: _InteractableVector, reverse=False, separator=":" +) -> _InteractableVector: + """Interact two interactable vectors. + + Parameters + ---------- + left : _InteractableVector + The left vector. + right : _InteractableVector + The right vector. + reverse : bool, optional + Whether to reverse the order of the interaction, by default False + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableVector + The interacted vector. + """ + if isinstance(left, _InteractableDenseVector): + if isinstance(right, _InteractableDenseVector): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableDenseVector(left.values * right.values, name=new_name) + + else: + return _interact(right, left, reverse=not reverse, separator=separator) + + if isinstance(left, _InteractableSparseVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableSparseVector( + left.values.multiply(right.values.reshape((-1, 1))), + name=new_name, + ) + + else: + return _interact(right, left, reverse=not reverse, separator=separator) + + if isinstance(left, _InteractableCategoricalVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): + if isinstance(right, _InteractableDenseVector): + right_values = right.values + else: + right_values = right.values.toarray().squeeze() + if not reverse: + new_categories = [ + f"{cat}{separator}{right.name}" for cat in left.categories + ] + new_name = f"{left.name}{separator}{right.name}" + else: + new_categories = [ + f"{right.name}{separator}{cat}" for cat in left.categories + ] + new_name = f"{right.name}{separator}{left.name}" + return _InteractableCategoricalVector( + codes=left.codes, + categories=new_categories, + multipliers=left.multipliers * right_values, + name=new_name, + ) + + elif isinstance(right, _InteractableCategoricalVector): + if not reverse: + return _interact_categoricals(left, right, separator=separator) + else: + return _interact_categoricals(right, left, separator=separator) + + raise TypeError( + f"Cannot interact {type(left).__name__} with {type(right).__name__}" + ) + + +def _interact_categoricals( + left: _InteractableCategoricalVector, + right: _InteractableCategoricalVector, + separator=":", +) -> _InteractableCategoricalVector: + """Interact two categorical vectors. + + Parameters + ---------- + left : _InteractableCategoricalVector + The left categorical vector. + right : _InteractableCategoricalVector + The right categorical vector. + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableCategoricalVector + The interacted categorical vector. + """ + cardinality_left = len(left.categories) + new_codes = right.codes * cardinality_left + left.codes + + na_mask = (left.codes == -1) | (right.codes == -1) + drop_mask = (left.codes == -2) | (right.codes == -2) + + new_codes[na_mask] = -1 + new_codes[drop_mask] = -2 + + new_categories = [ + f"{left_cat}{separator}{right_cat}" + for right_cat, left_cat in itertools.product(right.categories, left.categories) + ] + + return _InteractableCategoricalVector( + codes=new_codes, + categories=new_categories, + multipliers=left.multipliers * right.multipliers, + name=f"{left.name}{separator}{right.name}", + ) + + +def _C( + data, + *, + levels: Optional[Iterable[str]] = None, + missing_method: str = "fail", + missing_name: str = "(MISSING)", + spans_intercept: bool = True, +): + """ + Mark data as categorical. + + A reduced-functionality version of the ``formulaic`` ``C()`` function. It does not + support custom contrasts or the level argument, but it allows setting + ``spans_intercept=False`` to avoid dropping categories. + """ + + def encoder( + values: Any, + reduced_rank: bool, + drop_rows: list[int], + encoder_state: dict[str, Any], + model_spec: ModelSpec, + ): + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + return encode_contrasts( + values, + levels=levels, + reduced_rank=reduced_rank, + missing_method=missing_method, + missing_name=missing_name, + _state=encoder_state, + _spec=model_spec, + ) + + return FactorValues( + data, + kind="categorical", + spans_intercept=spans_intercept, + encoder=encoder, + ) + + +@stateful_transform +def encode_contrasts( + data, + *, + levels: Optional[Iterable[str]] = None, + missing_method: str = "fail", + missing_name: str = "(MISSING)", + reduced_rank: bool = False, + _state=None, + _spec=None, +) -> FactorValues[_InteractableCategoricalVector]: + """ + Encode a categorical dataset into one an _InteractableCategoricalVector + + Parameters + ---------- + data: The categorical data array/series to be encoded. + levels: The complete set of levels (categories) posited to be present in + the data. This can also be used to reorder the levels as needed. + reduced_rank: Whether to reduce the rank of output encoded columns in + order to avoid spanning the intercept. + """ + levels = levels if levels is not None else _state.get("categories") + add_missing_category = _state.get("add_missing_category", False) + + # Check for unseen categories when levels are specified + if levels is not None: + if missing_method == "convert" and not add_missing_category: + # We only need to include NAs in the check in this case because: + # - missing_method == "fail" raises a more appropriate error later + # - missings are no problem in the other cases + unseen_categories = set(data.unique()) - set(levels) + else: + unseen_categories = set(data.dropna().unique()) - set(levels) + + if unseen_categories: + raise ValueError( + f"Column {data.name} contains unseen categories: {unseen_categories}." + ) + + cat = pandas.Categorical(data._values, categories=levels) + _state["categories"] = cat.categories + _state["add_missing_category"] = add_missing_category or ( + missing_method == "convert" and cat.isna().any() + ) + + return _InteractableCategoricalVector.from_categorical( + cat, + reduced_rank=reduced_rank, + missing_method=missing_method, + missing_name=missing_name, + add_missing_category=add_missing_category, + ) + + +def _replace_sequence(lst: list[str], sequence: list[str], replacement: "str") -> None: + """Replace a sequence of elements in a list with a single element. + + Raises a ValueError if the sequence is not in the list in the correct + order. Only checks for the first possible start of the sequence. + + Parameters + ---------- + lst : List[str] + The list to replace elements in. + sequence : List[str] + The sequence of elements to replace. + replacement : str + The element to replace the sequence with. + """ + try: + start = lst.index(sequence[0]) + except ValueError: + start = 0 # Will handle this below + + for elem in sequence: + if lst[start] != elem: + raise ValueError("The sequence is not in the list") + del lst[start] + + lst.insert(start, replacement) diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 17739012..b1236f52 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -165,6 +165,76 @@ def standardize( def __getitem__(self, item): pass + @abstractmethod + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + pass + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + pass + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: list[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: list[Optional[str]]): + self.set_names(names, type="term") + # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the # behavior of this class __array_priority__ = 11 diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 4633e680..256f7caa 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -14,6 +14,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -22,7 +23,7 @@ ) -class SparseMatrix(sps.csc_matrix, MatrixBase): +class SparseMatrix(MatrixBase): """ A scipy.sparse csc matrix subclass that allows such objects to conform to the ``MatrixBase`` interface. @@ -30,37 +31,151 @@ class SparseMatrix(sps.csc_matrix, MatrixBase): SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix. """ - def __init__(self, arg1, shape=None, dtype=None, copy=False): - super().__init__(arg1, shape, dtype, copy) - self.idx_dtype = max(self.indices.dtype, self.indptr.dtype) - if self.indices.dtype != self.idx_dtype: - self.indices = self.indices.astype(self.idx_dtype) - if self.indptr.dtype != self.idx_dtype: - self.indptr = self.indptr.astype(self.idx_dtype) + def __init__( + self, + input_array, + shape=None, + dtype=None, + copy=False, + column_names=None, + term_names=None, + ): + if isinstance(input_array, np.ndarray): + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + + self._array = sps.csc_matrix(input_array, shape, dtype, copy) + + self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) + if self._array.indices.dtype != self.idx_dtype: + self._array.indices = self._array.indices.astype(self.idx_dtype) + if self._array.indptr.dtype != self.idx_dtype: + self._array.indptr = self._array.indptr.astype(self.idx_dtype) assert self.indices.dtype == self.idx_dtype - if not self.has_sorted_indices: - self.sort_indices() - self._x_csr = None + if not self._array.has_sorted_indices: + self._array.sort_indices() + self._array_csr = None + + if column_names is not None: + if len(column_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * self.shape[1] + + if term_names is not None: + if len(term_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} term names, got {len(term_names)}" + ) + self._terms = term_names + else: + self._terms = self._colnames + + def __getitem__(self, key): + row, col = _check_indexer(key) + colnames = list(np.array(self.column_names)[col].ravel()) + terms = list(np.array(self.term_names)[col].ravel()) + + return type(self)( + self._array.__getitem__((row, col)), column_names=colnames, term_names=terms + ) + + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + + __array_ufunc__ = None @property - def x_csr(self): + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + @property + def indices(self): + """Indices of the matrix.""" # noqa: D401 + return self._array.indices + + @property + def indptr(self): + """Indptr of the matrix.""" # noqa: D401 + return self._array.indptr + + @property + def data(self): + """Data of the matrix.""" # noqa: D401 + return self._array.data + + @property + def array_csc(self): + """Return the CSC representation of the matrix.""" + return self._array + + @property + def array_csr(self): """Cache the CSR representation of the matrix.""" - if self._x_csr is None: - self._x_csr = self.tocsr(copy=False) - if self._x_csr.indices.dtype != self.idx_dtype: - self._x_csr.indices = self._x_csr.indices.astype(self.idx_dtype) - if self._x_csr.indptr.dtype != self.idx_dtype: - self._x_csr.indptr = self._x_csr.indptr.astype(self.idx_dtype) + if self._array_csr is None: + self._array_csr = self._array.tocsr(copy=False) + if self._array_csr.indices.dtype != self.idx_dtype: + self._array_csr.indices = self._array_csr.indices.astype(self.idx_dtype) + if self._array_csr.indptr.dtype != self.idx_dtype: + self._array_csr.indptr = self._array_csr.indptr.astype(self.idx_dtype) + + return self._array_csr - return self._x_csr + def tocsc(self, copy=False): + """Return the matrix in CSC format.""" + return self._array.tocsc(copy=copy) + + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) + + T = property(transpose) + + def getcol(self, i): + """Return matrix column at specified index.""" + return type(self)( + self._array.getcol(i), + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) + + def unpack(self): + """Return the underlying scipy.sparse.csc_matrix.""" + return self._array + + def toarray(self): + """Return a dense ndarray representation of the matrix.""" + return self._array.toarray() + + def dot(self, other): + """Return the dot product as a scipy sparse matrix.""" + return self._array.dot(other) def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ X.""" - if not hasattr(d, "dtype"): - d = np.asarray(d) + d = np.asarray(d) if not self.dtype == d.dtype: raise TypeError( f"""self and d need to be of same dtype, either np.float64 @@ -69,7 +184,7 @@ def sandwich( ) rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) - return sparse_sandwich(self, self.x_csr, d, rows, cols) + return sparse_sandwich(self, self.array_csr, d, rows, cols) def _cross_sandwich( self, @@ -80,9 +195,11 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ): """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self.sandwich_dense(other, d, rows, L_cols, R_cols) from .categorical_matrix import CategoricalMatrix + from .dense_matrix import DenseMatrix + + if isinstance(other, DenseMatrix): + return self.sandwich_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return other._cross_sandwich(self, d, rows, R_cols, L_cols).T @@ -111,7 +228,7 @@ def sandwich_dense( rows, L_cols = setup_restrictions(self.shape, rows, L_cols) R_cols = set_up_rows_or_cols(R_cols, B.shape[1]) - return csr_dense_sandwich(self.x_csr, B, d, rows, L_cols, R_cols) + return csr_dense_sandwich(self.array_csr, B, d, rows, L_cols, R_cols) def _matvec_helper( self, @@ -128,9 +245,11 @@ def _matvec_helper( unrestricted_cols = cols is None or len(cols) == self.shape[1] if unrestricted_rows and unrestricted_cols and vec.ndim == 1: if transpose: - return csc_rmatvec_unrestricted(self, vec, out, self.indices) + return csc_rmatvec_unrestricted(self.array_csc, vec, out, self.indices) else: - return csr_matvec_unrestricted(self.x_csr, vec, out, self.x_csr.indices) + return csr_matvec_unrestricted( + self.array_csr, vec, out, self.array_csr.indices + ) matrix_matvec = lambda x, v: sps.csc_matrix.dot(x, v) if transpose: @@ -138,16 +257,16 @@ def _matvec_helper( rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) if transpose: - fast_fnc = lambda v: csc_rmatvec(self, v, rows, cols) + fast_fnc = lambda v: csc_rmatvec(self.array_csc, v, rows, cols) else: - fast_fnc = lambda v: csr_matvec(self.x_csr, v, rows, cols) + fast_fnc = lambda v: csr_matvec(self.array_csr, v, rows, cols) if vec.ndim == 1: res = fast_fnc(vec) elif vec.ndim == 2 and vec.shape[1] == 1: res = fast_fnc(vec[:, 0])[:, None] else: res = matrix_matvec( - self[np.ix_(rows, cols)], vec[rows] if transpose else vec[cols] + self[np.ix_(rows, cols)]._array, vec[rows] if transpose else vec[cols] ) if out is None: return res @@ -162,8 +281,6 @@ def matvec(self, vec, cols: np.ndarray = None, out: np.ndarray = None): check_matvec_out_shape(self, out) return self._matvec_helper(vec, None, cols, out, False) - __array_priority__ = 12 - def transpose_matvec( self, vec: Union[np.ndarray, list], @@ -179,7 +296,11 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra """Get standard deviations of columns.""" sqrt_arg = ( transpose_square_dot_weights( - self.data, self.indices, self.indptr, weights, weights.dtype + self._array.data, + self._array.indices, + self._array.indptr, + weights, + weights.dtype, ) - col_means**2 ) @@ -191,7 +312,7 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return SparseMatrix cast to new type.""" - return super().astype(dtype, casting, copy) + return type(self)(self._array.astype(dtype, casting, copy)) def multiply(self, other): """Element-wise multiplication. @@ -200,6 +321,87 @@ def multiply(self, other): from the parent class except that ``other`` is assumed to be a vector of size ``self.shape[0]``. """ - if other.ndim == 1: - return SparseMatrix(super().multiply(other[:, np.newaxis])) - return SparseMatrix(super().multiply(other)) + if np.asanyarray(other).ndim == 1: + return type(self)( + self._array.multiply(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.multiply(other), + column_names=self.column_names, + term_names=self.term_names, + ) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self._colnames))) + + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + + return list(names) + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 18f65a80..2210d55e 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -1,10 +1,10 @@ import warnings -from typing import Any, Optional, Union +from collections.abc import Sequence +from typing import Optional, Union import numpy as np from scipy import sparse as sps -from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix from .ext.split import is_sorted, split_col_subsets from .matrix_base import MatrixBase @@ -17,7 +17,7 @@ ) -def as_mx(a: Any): +def as_tabmat(a: Union[MatrixBase, StandardizedMatrix, np.ndarray, sps.spmatrix]): """Convert an array to a corresponding MatrixBase type. If the input is already a MatrixBase, return untouched. @@ -28,13 +28,37 @@ def as_mx(a: Any): if isinstance(a, (MatrixBase, StandardizedMatrix)): return a elif sps.issparse(a): - return SparseMatrix(a) + return SparseMatrix(a.tocsc(copy=False)) elif isinstance(a, np.ndarray): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") +def hstack(tup: Sequence[Union[MatrixBase, np.ndarray, sps.spmatrix]]) -> MatrixBase: + """Stack arrays in sequence horizontally (column wise). + + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + + Parameters + ---------- + tup: sequence of arrays + The arrays must have the same shape along all but the second axis. + """ + matrices = [as_tabmat(a) for a in tup] + + if len(matrices) == 0: + raise ValueError("Need at least one array to concatenate.") + + if all(isinstance(mat, SparseMatrix) for mat in matrices): + return SparseMatrix(sps.hstack([mat._array for mat in matrices])) + elif all(isinstance(mat, DenseMatrix) for mat in matrices): + return DenseMatrix(np.hstack([mat._array for mat in matrices])) + else: + return SplitMatrix(matrices) + + def _prepare_out_array(out: Optional[np.ndarray], out_shape, out_dtype): if out is None: out = np.zeros(out_shape, out_dtype) @@ -76,8 +100,14 @@ def _combine_matrices(matrices, indices): n_row = matrices[0].shape[0] for mat_type_, stack_fn in [ - (DenseMatrix, np.hstack), - (SparseMatrix, sps.hstack), + ( + DenseMatrix, + lambda matrices: np.hstack([mat._array for mat in matrices]), + ), + ( + SparseMatrix, + lambda matrices: sps.hstack([mat._array for mat in matrices]), + ), ]: this_type_matrices = [ i for i, mat in enumerate(matrices) if isinstance(mat, mat_type_) @@ -85,8 +115,16 @@ def _combine_matrices(matrices, indices): if len(this_type_matrices) > 1: new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices])) new_indices = np.concatenate([indices[i] for i in this_type_matrices]) + new_colnames = np.concatenate( + [np.array(matrices[i]._colnames) for i in this_type_matrices] + ) + new_terms = np.concatenate( + [np.array(matrices[i]._terms) for i in this_type_matrices] + ) sorter = np.argsort(new_indices) sorted_matrix = new_matrix[:, sorter] + sorted_matrix._colnames = list(new_colnames[sorter]) + sorted_matrix._terms = list(new_terms[sorter]) sorted_indices = new_indices[sorter] assert sorted_matrix.shape[0] == n_row @@ -130,7 +168,7 @@ class SplitMatrix(MatrixBase): def __init__( self, - matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], + matrices: Sequence[MatrixBase], indices: Optional[list[np.ndarray]] = None, ): flatten_matrices = [] @@ -144,7 +182,7 @@ def __init__( if isinstance(mat, SplitMatrix): # Flatten out the SplitMatrix current_idx = 0 - for iind, imat in zip(mat.indices, mat.matrices): + for iind, imat in zip(mat.indices, mat.matrices): # type: ignore flatten_matrices.append(imat) index_corrections.append( iind - np.arange(len(iind), dtype=np.int64) - current_idx @@ -449,3 +487,60 @@ def __repr__(self): return out __array_priority__ = 13 + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + names = np.empty(self.shape[1], dtype=object) + for idx, mat in zip(self.indices, self.matrices): + names[idx] = mat.get_names(type, missing_prefix, idx) + return list(names) + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + names_array = np.array(names) + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + for idx, mat in zip(self.indices, self.matrices): + mat.set_names(list(names_array[idx]), type) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index d35ed53b..0b609463 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Optional, Union import numpy as np from scipy import sparse as sps @@ -147,7 +147,7 @@ def sandwich( limited_shift = self.shift[cols] if cols is not None else self.shift limited_d = d[rows] if rows is not None else d - term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * limited_d.sum()) + term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * np.sum(limited_d)) res = term2 + term3_and_4 if isinstance(term1, sps.dia_matrix): idx = np.arange(res.shape[0]) @@ -298,3 +298,72 @@ def __repr__(self): Mult: {self.mult} """ return out + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[list[int]] = None, + ) -> list[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + list[Optional[str]] + Column names. + """ + return self.mat.get_names(type, missing_prefix, indices) + + def set_names(self, names: Union[str, list[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: list[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is + that a categorical submatrix counts as one term, but can count as + multiple columns. Furthermore, matrices created from formulas + distinguish between columns and terms (c.f. ``formulaic`` docs). + """ + self.mat.set_names(names, type) + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: list[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: list[Optional[str]]): + self.set_names(names, type="term") diff --git a/src/tabmat/util.py b/src/tabmat/util.py index 83d05f26..1bd8842e 100644 --- a/src/tabmat/util.py +++ b/src/tabmat/util.py @@ -50,3 +50,51 @@ def check_matvec_dimensions(mat, vec: np.ndarray, transpose: bool) -> None: f"shapes {mat.shape} and {vec.shape} not aligned: " f"{mat.shape[match_dim]} (dim {match_dim}) != {vec.shape[0]} (dim 0)" ) + + +def _check_indexer(indexer): + """Check that the indexer is valid, and transform it to a canonical format.""" + if not isinstance(indexer, tuple): + indexer = (indexer, slice(None, None, None)) + + if len(indexer) > 2: + raise ValueError("More than two indexers are not supported.") + + row_indexer, col_indexer = indexer + + if isinstance(row_indexer, slice): + if isinstance(col_indexer, slice): + return row_indexer, col_indexer + else: + col_indexer = np.asarray(col_indexer) + if col_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer, col_indexer.reshape(-1) + + elif isinstance(col_indexer, slice): + row_indexer = np.asarray(row_indexer) + if row_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer.reshape(-1), col_indexer + + else: + row_indexer = np.asarray(row_indexer) + col_indexer = np.asarray(col_indexer) + if row_indexer.ndim <= 1 and col_indexer.ndim <= 1: + return np.ix_(row_indexer.reshape(-1), col_indexer.reshape(-1)) + elif ( + row_indexer.ndim == 2 + and row_indexer.shape[1] == 1 + and col_indexer.ndim == 2 + and col_indexer.shape[0] == 1 + ): + # support for np.ix_-ed indices + return row_indexer, col_indexer + else: + raise ValueError("This type of indexing is not supported.") diff --git a/tests/test_categorical_matrix.py b/tests/test_categorical_matrix.py index f1be385a..9e7cb7a9 100644 --- a/tests/test_categorical_matrix.py +++ b/tests/test_categorical_matrix.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pandas as pd import pytest @@ -6,55 +8,146 @@ @pytest.fixture -def cat_vec(): +def cat_vec(missing): m = 10 seed = 0 rng = np.random.default_rng(seed) - return rng.choice([0, 1, 2, np.inf, -np.inf], size=m) + vec = rng.choice([0, 1, 2, np.inf, -np.inf], size=m) + if missing: + vec[vec == 1] = np.nan + return vec @pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32]) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_recover_orig(cat_vec, vec_dtype, drop_first): - orig_recovered = CategoricalMatrix(cat_vec, drop_first=drop_first).recover_orig() +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_recover_orig(cat_vec, vec_dtype, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + orig_recovered = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ).recover_orig() np.testing.assert_equal(orig_recovered, cat_vec) @pytest.mark.parametrize("vec_dtype", [np.float64, np.float32, np.int64, np.int32]) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_csr_matvec_categorical(cat_vec, vec_dtype, drop_first): - mat = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_csr_matvec_categorical( + cat_vec, vec_dtype, drop_first, missing, cat_missing_method +): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + mat = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=(cat_missing_method == "convert" and missing), + ) + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) vec = np.random.choice(np.arange(4, dtype=vec_dtype), mat.shape[1]) res = cat_mat.matvec(vec) np.testing.assert_allclose(res, cat_mat.A.dot(vec)) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_tocsr(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_tocsr(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) res = cat_mat.tocsr().A - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ) np.testing.assert_allclose(res, expected) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_transpose_matvec(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_transpose_matvec(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) other = np.random.random(cat_mat.shape[0]) res = cat_mat.transpose_matvec(other) - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8").T.dot( - other - ) + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ).T.dot(other) np.testing.assert_allclose(res, expected) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_multiply(cat_vec, drop_first): - cat_mat = CategoricalMatrix(cat_vec, drop_first=drop_first) +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_multiply(cat_vec, drop_first, missing, cat_missing_method): + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + cat_mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) other = np.arange(len(cat_vec))[:, None] actual = cat_mat.multiply(other) - expected = pd.get_dummies(cat_vec, drop_first=drop_first, dtype="uint8") * other + expected = ( + pd.get_dummies( + cat_vec, + drop_first=drop_first, + dtype="uint8", + dummy_na=cat_missing_method == "convert" and missing, + ) + * other + ) np.testing.assert_allclose(actual.A, expected) @@ -65,9 +158,48 @@ def test_nulls(mi_element): CategoricalMatrix(vec) -@pytest.mark.parametrize("drop_first", [True, False]) -def test_categorical_indexing(drop_first): - catvec = [0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 3] - mat = CategoricalMatrix(catvec, drop_first=drop_first) - expected = pd.get_dummies(catvec, drop_first=drop_first).to_numpy()[:, [0, 1]] +@pytest.mark.parametrize("cat_missing_name", ["(MISSING)", "__None__", "[NULL]"]) +def test_cat_missing_name(cat_missing_name): + vec = [None, "(MISSING)", "__None__", "a", "b"] + if cat_missing_name in vec: + with pytest.raises( + ValueError, + match=re.escape(f"Missing category {cat_missing_name} already exists."), + ): + CategoricalMatrix( + vec, cat_missing_method="convert", cat_missing_name=cat_missing_name + ) + else: + cat = CategoricalMatrix( + vec, cat_missing_method="convert", cat_missing_name=cat_missing_name + ) + assert set(cat.cat.categories) == set(vec) - {None} | {cat_missing_name} + + +@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"]) +@pytest.mark.parametrize("missing", [True, False], ids=["missing", "no_missing"]) +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_categorical_indexing(drop_first, missing, cat_missing_method): + if not missing: + cat_vec = [0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 3] + else: + cat_vec = [0, None, 2, 0, None, 2, 0, None, 2, 3, 3] + + if missing and cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + return + + mat = CategoricalMatrix( + cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method + ) + expected = pd.get_dummies( + cat_vec, + drop_first=drop_first, + dummy_na=cat_missing_method == "convert" and missing, + ).to_numpy()[:, [0, 1]] np.testing.assert_allclose(mat[:, [0, 1]].A, expected) diff --git a/tests/test_formula.py b/tests/test_formula.py new file mode 100644 index 00000000..0227f7a4 --- /dev/null +++ b/tests/test_formula.py @@ -0,0 +1,1128 @@ +import pickle +import re +from io import BytesIO + +import formulaic +import numpy as np +import pandas as pd +import pytest +from formulaic.materializers import FormulaMaterializer +from formulaic.materializers.types import EvaluatedFactor, FactorValues +from formulaic.parser.types import Factor +from scipy import sparse as sps + +import tabmat as tm +from tabmat.formula import ( + TabmatMaterializer, + _interact, + _InteractableCategoricalVector, + _InteractableDenseVector, + _InteractableSparseVector, +) + + +@pytest.fixture +def df(): + df = pd.DataFrame( + { + "num_1": [1.0, 2.0, 3.0, 4.0, 5.0], + "num_2": [5.0, 4.0, 3.0, 2.0, 1.0], + "cat_1": pd.Categorical(["a", "b", "c", "b", "a"]), + "cat_2": pd.Categorical(["x", "y", "z", "x", "y"]), + "cat_3": pd.Categorical(["1", "2", "1", "2", "1"]), + "str_1": ["a", "b", "c", "b", "a"], + } + ) + return df + + +def test_retrieval(): + assert FormulaMaterializer.for_materializer("tabmat") is TabmatMaterializer + assert ( + FormulaMaterializer.for_data(pd.DataFrame(), output="tabmat") + is TabmatMaterializer + ) + + +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "1 + num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "1 + cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + [ + "__drop__", + "cat_1[b]", + "cat_1[c]", + "cat_1[b]", + "__drop__", + ], + categories=["__drop__", "cat_1[b]", "cat_1[c]"], + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "{np.where(num_1 >= 2, num_1, 0)} * {np.where(num_2 <= 2, num_2, 0)}", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[0.0, 2.0, 3.0, 4.0, 5.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + ] + ) + ).T + ), + ] + ), + id="numeric_sparse", + ), + pytest.param( + "1 + num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + [ + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[c]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[a]:cat_3[1]", + ], + categories=[ + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[1]", + "cat_1[c]:cat_3[1]", + "cat_1[a]:cat_3[2]", + "cat_1[c]:cat_3[2]", + "cat_1[b]:cat_3[2]", + ], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation(df, formula, expected): + model_df = tm.from_formula( + formula, df, ensure_full_rank=True, cat_threshold=1, sparse_threshold=0.5 + ) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "1 + num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "1 + cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + [ + "__drop__", + "cat_1__b", + "cat_1__c", + "cat_1__b", + "__drop__", + ], + categories=["__drop__", "cat_1__b", "cat_1__c"], + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "1 + num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + [ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__c__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__a__x__cat_3__1", + ], + categories=[ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__1", + "cat_1__c__x__cat_3__1", + "cat_1__a__x__cat_3__2", + "cat_1__c__x__cat_3__2", + "cat_1__b__x__cat_3__2", + ], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation_qcl(df, formula, expected): + model_df = tm.from_formula( + formula, + df, + cat_threshold=1, + sparse_threshold=0.5, + ensure_full_rank=True, + interaction_separator="__x__", + categorical_format="{name}__{category}", + intercept_name="intercept", + ) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("num_1 + num_2", id="numeric"), + pytest.param("cat_1 + cat_2", id="categorical"), + pytest.param("cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("bs(num_1, 3)", id="spline"), + pytest.param( + "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + pytest.param( + "C(num_1)", + id="convert_to_categorical", + ), + pytest.param( + "C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + ), + pytest.param("str_1", id="string_as_categorical"), + ], +) +def test_matrix_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=ensure_full_rank, + include_intercept=True, + context=0, + ) + np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param( + "1 + num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric" + ), + pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), + pytest.param( + "1 + cat_1", ("Intercept", "cat_1[b]", "cat_1[c]"), id="categorical" + ), + pytest.param( + "1 + cat_2 * cat_3", + ( + "Intercept", + "cat_2[y]", + "cat_2[z]", + "cat_3[2]", + "cat_2[y]:cat_3[2]", + "cat_2[z]:cat_3[2]", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("Intercept", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula(formula, df, ensure_full_rank=True) + assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param( + "1 + cat_1", ("intercept", "cat_1__b", "cat_1__c"), id="categorical" + ), + pytest.param( + "1 + cat_2 * cat_3", + ( + "intercept", + "cat_2__y", + "cat_2__z", + "cat_3__2", + "cat_2__y__x__cat_3__2", + "cat_2__z__x__cat_3__2", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("intercept", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_names_against_expectation_qcl(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + categorical_format="{name}__{category}", + interaction_separator="__x__", + intercept_name="intercept", + ) + assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param("1 + cat_1", ("1", "cat_1", "cat_1"), id="categorical"), + pytest.param( + "1 + cat_2 * cat_3", + ( + "1", + "cat_2", + "cat_2", + "cat_3", + "cat_2:cat_3", + "cat_2:cat_3", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)", "poly(num_1, 3)", "poly(num_1, 3)"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("1", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_term_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + intercept_name="intercept", + ) + assert model_tabmat.term_names == list(expected_names) + + +@pytest.mark.parametrize( + "categorical_format", + ["{name}[{category}]", "{name}__{category}", "{name}<<{category}>>"], + ids=["brackets", "double_underscore", "custom"], +) +def test_all_names_against_from_pandas(df, categorical_format): + mat_from_pandas = tm.from_pandas( + df, drop_first=False, object_as_cat=True, categorical_format=categorical_format + ) + mat_from_formula = tm.from_formula( + "num_1 + num_2 + cat_1 + cat_2 + cat_3 + str_1 - 1", + data=df, + ensure_full_rank=False, + categorical_format=categorical_format, + ) + + assert mat_from_formula.column_names == mat_from_pandas.column_names + assert mat_from_formula.term_names == mat_from_pandas.term_names + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("1 + num_1 + num_2", id="numeric"), + pytest.param("1 + cat_1 + cat_2", id="categorical"), + pytest.param("1 + cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("1 + num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("1 + {np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("1 + {num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("1 + bs(num_1, 3)", id="spline"), + pytest.param( + "1 + poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + pytest.param( + "1 + C(num_1)", + id="convert_to_categorical", + ), + pytest.param( + "1 + C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + ), + ], +) +def test_names_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=ensure_full_rank, + categorical_format="{name}[T.{category}]", + context=0, + ) + assert model_tabmat.model_spec.column_names == model_df.model_spec.column_names + assert model_tabmat.model_spec.column_names == tuple(model_df.columns) + assert model_tabmat.column_names == list(model_df.columns) + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula, formula_with_intercept, formula_wo_intercept", + [ + ("num_1", "1 + num_1", "num_1 - 1"), + ("cat_1", "1 + cat_1", "cat_1 - 1"), + ( + "num_1 * cat_1 * cat_2", + "1 + num_1 * cat_1 * cat_2", + "num_1 * cat_1 * cat_2 - 1", + ), + ], + ids=["numeric", "categorical", "mixed"], +) +def test_include_intercept( + df, formula, formula_with_intercept, formula_wo_intercept, ensure_full_rank +): + model_no_include = tm.from_formula( + formula, df, include_intercept=False, ensure_full_rank=ensure_full_rank + ) + model_no_intercept = tm.from_formula( + formula_wo_intercept, + df, + include_intercept=True, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + model_include = tm.from_formula( + formula, df, include_intercept=True, ensure_full_rank=ensure_full_rank + ) + model_intercept = tm.from_formula( + formula_with_intercept, + df, + include_intercept=False, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_include.A, model_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + +@pytest.mark.parametrize( + "formula", + [ + pytest.param("str_1 : cat_1", id="implicit"), + pytest.param("C(str_1) : C(cat_1, spans_intercept=False)", id="explicit"), + ], +) +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +def test_C_state(df, formula, ensure_full_rank): + model_tabmat = tm.from_formula( + "str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank + ) + model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2]) + np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A) + np.testing.assert_array_equal( + model_tabmat.matrices[1].cat.categories, + model_tabmat_2.matrices[1].cat.categories, + ) + + +VECTORS = [ + _InteractableDenseVector(np.array([1, 2, 3, 4, 5], dtype=np.float64)).set_name( + "dense" + ), + _InteractableSparseVector( + sps.csc_matrix(np.array([[1, 0, 0, 0, 2]], dtype=np.float64).T) + ).set_name("sparse"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=True + ).set_name("cat_reduced"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=False + ).set_name("cat_full"), +] + + +@pytest.mark.parametrize( + "left", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize( + "right", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"]) +def test_interactable_vectors(left, right, reverse): + n = left.to_tabmat().shape[0] + left_np = left.to_tabmat().A.reshape((n, -1)) + right_np = right.to_tabmat().A.reshape((n, -1)) + + if reverse: + left_np, right_np = right_np, left_np + + if isinstance(left, _InteractableCategoricalVector) and isinstance( + right, _InteractableCategoricalVector + ): + result_np = np.zeros((n, left_np.shape[1] * right_np.shape[1])) + for i in range(right_np.shape[1]): + for j in range(left_np.shape[1]): + result_np[:, i * left_np.shape[1] + j] = left_np[:, j] * right_np[:, i] + else: + result_np = left_np * right_np + + result_vec = _interact(left, right, reverse=reverse) + + # Test types + if isinstance(left, _InteractableCategoricalVector) or isinstance( + right, _InteractableCategoricalVector + ): + assert isinstance(result_vec, _InteractableCategoricalVector) + elif isinstance(left, _InteractableSparseVector) or isinstance( + right, _InteractableSparseVector + ): + assert isinstance(result_vec, _InteractableSparseVector) + else: + assert isinstance(result_vec, _InteractableDenseVector) + + # Test values + np.testing.assert_array_equal( + result_vec.to_tabmat().A.squeeze(), result_np.squeeze() + ) + + # Test names + if not reverse: + assert result_vec.name == left.name + ":" + right.name + else: + assert result_vec.name == right.name + ":" + left.name + + +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"]) +@pytest.mark.parametrize( + "cat_missing_name", + ["__missing__", "(MISSING)"], +) +def test_cat_missing_handling(cat_missing_method, cat_missing_name): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + } + ) + + mat_from_pandas = tm.from_pandas( + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + mat_from_formula = tm.from_formula( + "cat_1 - 1", + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + assert mat_from_pandas.column_names == mat_from_formula.column_names + assert mat_from_pandas.term_names == mat_from_formula.term_names + np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A) + + mat_from_formula_new = mat_from_formula.model_spec.get_model_matrix(df) + assert mat_from_pandas.column_names == mat_from_formula_new.column_names + assert mat_from_pandas.term_names == mat_from_formula_new.term_names + np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula_new.A) + + +def test_cat_missing_C(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + "cat_2": pd.Categorical(["1", "2", None, "1", "2"]), + } + ) + formula = ( + "C(cat_1, missing_method='convert', missing_name='M') " + "+ C(cat_2, missing_method='zero')" + ) + expected_names = [ + "C(cat_1, missing_method='convert', missing_name='M')[a]", + "C(cat_1, missing_method='convert', missing_name='M')[b]", + "C(cat_1, missing_method='convert', missing_name='M')[M]", + "C(cat_2, missing_method='zero')[1]", + "C(cat_2, missing_method='zero')[2]", + ] + + result = tm.from_formula(formula, df) + + assert result.column_names == expected_names + assert result.model_spec.get_model_matrix(df).column_names == expected_names + np.testing.assert_equal(result.model_spec.get_model_matrix(df).A, result.A) + np.testing.assert_equal( + result.model_spec.get_model_matrix(df[:2]).A, result.A[:2, :] + ) + + +@pytest.mark.parametrize( + "cat_missing_method", ["zero", "convert"], ids=["zero", "convert"] +) +def test_cat_missing_unseen(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", None]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + result_unseen = result_seen.model_spec.get_model_matrix(df_unseen) + + assert result_seen.column_names == result_unseen.column_names + if cat_missing_method == "convert": + expected_array = np.array([[1, 0, 0], [0, 0, 1]], dtype=np.float64) + elif cat_missing_method == "zero": + expected_array = np.array([[1, 0], [0, 0]], dtype=np.float64) + + np.testing.assert_array_equal(result_unseen.A, expected_array) + + +def test_cat_missing_interactions(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + "cat_2": pd.Categorical(["1", "2", None, "1", "2"]), + } + ) + formula = "C(cat_1, missing_method='convert') : C(cat_2, missing_method='zero') - 1" + expected_names = [ + "C(cat_1, missing_method='convert')[a]:C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[b]:C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[(MISSING)]:" + + "C(cat_2, missing_method='zero')[1]", + "C(cat_1, missing_method='convert')[a]:C(cat_2, missing_method='zero')[2]", + "C(cat_1, missing_method='convert')[b]:C(cat_2, missing_method='zero')[2]", + "C(cat_1, missing_method='convert')[(MISSING)]:" + + "C(cat_2, missing_method='zero')[2]", + ] + + assert tm.from_formula(formula, df).column_names == expected_names + + +@pytest.mark.parametrize( + "cat_missing_method", ["zero", "convert", "fail"], ids=["zero", "convert", "fail"] +) +def test_unseen_category(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", "c"]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + + +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert", "fail"]) +def test_unseen_missing(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", pd.NA]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + + if cat_missing_method == "convert": + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "zero": + result_unseen = result_seen.model_spec.get_model_matrix(df_unseen) + assert result_unseen.A.shape == (3, 2) + np.testing.assert_array_equal( + result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]]) + ) + assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"] + + +# Tests from formulaic's test suite +# --------------------------------- + +FORMULAIC_TESTS = { + # '': (, , , ) + "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), + "A": ( + ["Intercept", "A[b]", "A[c]"], + ["Intercept", "A[a]", "A[b]", "A[c]"], + ["Intercept", "A[c]"], + 2, + ), + "C(A)": ( + ["Intercept", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[a]", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[c]"], + 2, + ), + "A:a": ( + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a"], + 1, + ), + "A:B": ( + [ + "Intercept", + "B[b]", + "B[c]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[b]:B[c]", + "A[c]:B[c]", + ], + [ + "Intercept", + "A[a]:B[a]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[a]:B[b]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[a]:B[c]", + "A[b]:B[c]", + "A[c]:B[c]", + ], + ["Intercept"], + 1, + ), +} + + +class TestFormulaicTests: + @pytest.fixture + def data(self): + return pd.DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "A": ["a", "b", "c"], "B": ["a", "b", "c"]} + ) + + @pytest.fixture + def data_with_nulls(self): + return pd.DataFrame( + {"a": [1, 2, None], "A": ["a", None, "c"], "B": ["a", "b", None]} + ) + + @pytest.fixture + def materializer(self, data): + return TabmatMaterializer(data) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_get_model_matrix(self, materializer, formula, tests): + mm = materializer.get_model_matrix(formula, ensure_full_rank=True) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0])) + assert list(mm.model_spec.column_names) == tests[0] + + mm = materializer.get_model_matrix(formula, ensure_full_rank=False) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[1])) + assert list(mm.model_spec.column_names) == tests[1] + + def test_get_model_matrix_edge_cases(self, materializer): + mm = materializer.get_model_matrix(("a",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], tm.MatrixBase) + + mm = materializer.get_model_matrix("a ~ A", ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert "lhs" in mm.model_spec + assert "rhs" in mm.model_spec + + mm = materializer.get_model_matrix(("a ~ A",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], formulaic.ModelMatrices) + + def test_get_model_matrix_invalid_output(self, materializer): + with pytest.raises( + formulaic.errors.FormulaMaterializationError, + match=r"Nominated output .* is invalid\. Available output types are: ", + ): + materializer.get_model_matrix( + "a", ensure_full_rank=True, output="invalid_output" + ) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_na_handling(self, data_with_nulls, formula, tests): + mm = TabmatMaterializer(data_with_nulls).get_model_matrix(formula) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (tests[3], len(tests[2])) + assert list(mm.model_spec.column_names) == tests[2] + + if formula != "a": + pytest.skip("Tabmat does not allo NAs in categoricals") + + mm = TabmatMaterializer(data_with_nulls).get_model_matrix( + formula, na_action="ignore" + ) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) + + def test_state(self, materializer): + mm = materializer.get_model_matrix("center(a) - 1") + assert isinstance(mm, tm.MatrixBase) + assert list(mm.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm.getcol(0).unpack().squeeze(), [-1, 0, 1]) + + mm2 = TabmatMaterializer(pd.DataFrame({"a": [4, 5, 6]})).get_model_matrix( + mm.model_spec + ) + assert isinstance(mm2, tm.MatrixBase) + assert list(mm2.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm2.getcol(0).unpack().squeeze(), [2, 3, 4]) + + mm3 = mm.model_spec.get_model_matrix(pd.DataFrame({"a": [4, 5, 6]})) + assert isinstance(mm3, tm.MatrixBase) + assert list(mm3.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm3.getcol(0).unpack().squeeze(), [2, 3, 4]) + + def test_factor_evaluation_edge_cases(self, materializer): + # Test that categorical kinds are set if type would otherwise be numerical + ev_factor = materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="categorical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + assert ev_factor.metadata.kind.value == "categorical" + + # Test that other kind mismatches result in an exception + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "Factor `A` is expecting values of kind 'numerical', " + "but they are actually of kind 'categorical'." + ), + ): + materializer._evaluate_factor( + Factor("A", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + + # For an encoding has already been determined, an exception + # should be raised if the new encoding does not match + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "The model specification expects factor `a` to have values of kind " + "`categorical`, but they are actually of kind `numerical`." + ), + ): + materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec( + formula=[], encoder_state={"a": ("categorical", {})} + ), + drop_rows=set(), + ) + + def test__is_categorical(self, materializer): + assert materializer._is_categorical([1, 2, 3]) is False + assert materializer._is_categorical(pd.Series(["a", "b", "c"])) is True + assert materializer._is_categorical(pd.Categorical(["a", "b", "c"])) is True + assert materializer._is_categorical(FactorValues({}, kind="categorical")) + + def test_encoding_edge_cases(self, materializer): + # Verify that constant encoding works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("10", eval_method="literal", kind="constant"), + values=FactorValues(10, kind="constant"), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["10"].values, [10, 10, 10]) + + # Verify that unencoded dictionaries with drop-fields work + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + np.testing.assert_array_equal(encoded_factor["a[a]"].values, [1, 2, 3]) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + reduced_rank=True, + ) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + # Verify that encoding of nested dictionaries works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("A", eval_method="python", kind="numerical"), + values=FactorValues( + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([4, 5, 6]), + "__metadata__": None, + }, + kind="numerical", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["A[a]"].values, [1, 2, 3]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("B", eval_method="python", kind="categorical"), + values=FactorValues( + {"a": pd.Series(["a", "b", "c"])}, kind="categorical" + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + encoded_matrix = ( + encoded_factor["B[a]"].set_name("B[a]").to_tabmat(cat_threshold=1) + ) + assert list(encoded_matrix.cat) == ["B[a][a]", "B[a][b]", "B[a][c]"] + + def test_empty(self, materializer): + mm = materializer.get_model_matrix("0", ensure_full_rank=True) + assert mm.shape[1] == 0 + mm = materializer.get_model_matrix("0", ensure_full_rank=False) + assert mm.shape[1] == 0 + + def test_category_reordering(self): + data = pd.DataFrame({"A": ["a", "b", "c"]}) + data2 = pd.DataFrame({"A": ["c", "b", "a"]}) + data3 = pd.DataFrame( + {"A": pd.Categorical(["c", "b", "a"], categories=["c", "b", "a"])} + ) + + m = TabmatMaterializer(data).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] + + m2 = TabmatMaterializer(data2).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m2.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] + + m3 = TabmatMaterializer(data3).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m3.model_spec.column_names) == ["A[c]", "A[b]", "A[a]"] + + def test_term_clustering(self, materializer): + assert materializer.get_model_matrix( + "a + b + a:A + b:A" + ).model_spec.column_names == ( + "Intercept", + "a", + "b", + "a:A[b]", + "a:A[c]", + "b:A[b]", + "b:A[c]", + ) + assert materializer.get_model_matrix( + "a + b + a:A + b:A", cluster_by="numerical_factors" + ).model_spec.column_names == ( + "Intercept", + "a", + "a:A[b]", + "a:A[c]", + "b", + "b:A[b]", + "b:A[c]", + ) + + def test_model_spec_pickleable(self, materializer): + o = BytesIO() + ms = materializer.get_model_matrix("a ~ a:A") + pickle.dump(ms.model_spec, o) + o.seek(0) + ms2 = pickle.load(o) + assert isinstance(ms, formulaic.parser.types.Structured) + assert ms2.lhs.formula.root == ["a"] diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 81acc658..50f32b3c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -24,7 +24,7 @@ def dense_matrix_C() -> tm.DenseMatrix: def dense_matrix_not_writeable() -> tm.DenseMatrix: mat = dense_matrix_F() - mat.setflags(write=False) + mat._array.setflags(write=False) return mat @@ -233,7 +233,7 @@ def test_to_array_standardized_mat(mat: tm.StandardizedMatrix): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.asarray, tm.DenseMatrix], + [lambda x: x, np.asarray], ) @pytest.mark.parametrize("cols", [None, [], [1], np.array([1])]) @pytest.mark.parametrize("other_shape", [[], [1], [2]]) @@ -243,7 +243,7 @@ def test_matvec( """ Mat. - other_type: Function transforming list to list, array, or DenseMatrix + t: Function transforming list to list, array, or DenseMatrix cols: Argument 1 to matvec, specifying which columns of the matrix (and which elements of 'other') to use other_shape: Second dimension of 'other.shape', if any. If other_shape is [], then @@ -303,7 +303,7 @@ def process_mat_vec_subsets(mat, vec, mat_rows, mat_cols, vec_idxs): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [2], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [1], np.arange(1)]) @@ -373,7 +373,7 @@ def test_cross_sandwich( @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [1], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [0], np.arange(1)]) @@ -430,7 +430,7 @@ def test_transpose(mat): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): vec_as_list = [3.0, -0.1, 0] @@ -440,7 +440,7 @@ def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): expected = vec_as_list @ mat.A np.testing.assert_allclose(res, expected) np.testing.assert_allclose(res2, expected) - assert isinstance(res, np.ndarray) + assert isinstance(res, (np.ndarray, tm.DenseMatrix)) @pytest.mark.parametrize("mat", get_matrices()) @@ -552,17 +552,69 @@ def test_indexing_int_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0, :] if not isinstance(res, np.ndarray): res = res.A - expected = mat.A[0, :] - np.testing.assert_allclose(np.squeeze(res), expected) + expected = mat.A[[0], :] + np.testing.assert_allclose(res, expected) @pytest.mark.parametrize("mat", get_matrices()) def test_indexing_range_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0:2, :] + assert res.ndim == 2 if not isinstance(res, np.ndarray): res = res.A expected = mat.A[0:2, :] - np.testing.assert_allclose(np.squeeze(res), expected) + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_col(mat): + res = mat[:, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 1) + expected = mat.A[:, [0]] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_range_col(mat): + res = mat[:, 0:2] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 2) + expected = mat.A[:, 0:2] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_both(mat): + res = mat[0, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (1, 1) + expected = mat.A[0, 0] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_seq_both(mat): + res = mat[[0, 1], [0, 1]] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[np.ix_([0, 1], [0, 1])] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_ix_both(mat): + indexer = np.ix_([0, 1], [0, 1]) + res = mat[indexer] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[indexer] + np.testing.assert_array_equal(res, expected) def test_pandas_to_matrix(): @@ -631,3 +683,215 @@ def test_multiply(mat): for act in actual: assert isinstance(act, MatrixBase) np.testing.assert_allclose(act.A, expected) + + +@pytest.mark.parametrize( + "mat_1", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +@pytest.mark.parametrize( + "mat_2", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +def test_hstack(mat_1, mat_2): + mats = [mat_1, mat_2] + stacked = tm.hstack(mats) + + if all(isinstance(mat, (np.ndarray, tm.DenseMatrix)) for mat in mats): + assert isinstance(stacked, tm.DenseMatrix) + elif all(isinstance(mat, (sps.csc_matrix, tm.SparseMatrix)) for mat in mats): + assert isinstance(stacked, tm.SparseMatrix) + else: + assert isinstance(stacked, tm.SplitMatrix) + + np.testing.assert_array_equal( + stacked.A, + np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]), + ) + + +def test_names_against_expectation(): + X = tm.DenseMatrix( + np.ones((5, 2)), column_names=["a", None], term_names=["a", None] + ) + Xc = tm.CategoricalMatrix( + pd.Categorical(["a", "b", "c", "b", "a"]), column_name="c", term_name="c" + ) + Xc2 = tm.CategoricalMatrix(pd.Categorical(["a", "b", "c", "b", "a"])) + Xs = tm.SparseMatrix( + sps.csc_matrix(np.ones((5, 2))), + column_names=["s1", "s2"], + term_names=["s", "s"], + ) + + mat = tm.SplitMatrix(matrices=[X, Xc, Xc2, Xs]) + + assert mat.get_names(type="column") == [ + "a", + None, + "c[a]", + "c[b]", + "c[c]", + None, + None, + None, + "s1", + "s2", + ] + + assert mat.get_names(type="term") == [ + "a", + None, + "c", + "c", + "c", + None, + None, + None, + "s", + "s", + ] + + assert mat.get_names(type="column", missing_prefix="_col_") == [ + "a", + "_col_1", + "c[a]", + "c[b]", + "c[c]", + "_col_5-7[a]", + "_col_5-7[b]", + "_col_5-7[c]", + "s1", + "s2", + ] + + assert mat.get_names(type="term", missing_prefix="_col_") == [ + "a", + "_col_1", + "c", + "c", + "c", + "_col_5-7", + "_col_5-7", + "_col_5-7", + "s", + "s", + ] + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_names_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="column") + mat.column_names = names + assert mat.column_names == names + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_terms_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="term") + mat.term_names = names + assert mat.term_names == names + + +@pytest.mark.parametrize("indexer_1", [slice(None, None), 0, slice(2, 8)]) +@pytest.mark.parametrize("indexer_2", [[0], slice(1, 4), [0, 2, 3], [4, 3, 2, 1, 0]]) +@pytest.mark.parametrize("sparse", [True, False]) +def test_names_indexing(indexer_1, indexer_2, sparse): + X = np.ones((10, 5), dtype=np.float64) + colnames = ["a", "b", None, "d", "e"] + termnames = ["t1", "t1", None, "t4", "t5"] + + colnames_array = np.array(colnames) + termnames_array = np.array(termnames) + + if sparse: + X = tm.SparseMatrix( + sps.csc_matrix(X), column_names=colnames, term_names=termnames + ) + else: + X = tm.DenseMatrix(X, column_names=colnames, term_names=termnames) + + X_indexed = X[indexer_1, indexer_2] + if not isinstance(X_indexed, tm.MatrixBase): + pytest.skip("Does not return MatrixBase") + assert X_indexed.column_names == list(colnames_array[indexer_2]) + assert X_indexed.term_names == list(termnames_array[indexer_2]) + + +@pytest.mark.parametrize("mat_1", get_all_matrix_base_subclass_mats()) +@pytest.mark.parametrize("mat_2", get_all_matrix_base_subclass_mats()) +def test_combine_names(mat_1, mat_2): + mat_1.column_names = mat_1.get_names(missing_prefix="m1_", type="column") + mat_2.column_names = mat_2.get_names(missing_prefix="m2_", type="column") + + mat_1.term_names = mat_1.get_names(missing_prefix="m1_", type="term") + mat_2.term_names = mat_2.get_names(missing_prefix="m2_", type="term") + + combined = tm.SplitMatrix(matrices=[mat_1, mat_2]) + + assert combined.column_names == mat_1.column_names + mat_2.column_names + assert combined.term_names == mat_1.term_names + mat_2.term_names + + +@pytest.mark.parametrize("prefix_sep", ["_", ": "]) +@pytest.mark.parametrize("drop_first", [True, False]) +def test_names_pandas(prefix_sep, drop_first): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = pd.DataFrame( + data={ + "d": dense_ser, + "cl_obj": cat_ser_lowdim.astype(object), + "ch": cat_ser_highdim, + "ds": lowdense_ser, + "s": sparse_ser, + } + ) + + categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="end", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first) + assert mat_end.column_names == expanded_df.columns.tolist() + + mat_expand = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="expand", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + unique_terms = list(dict.fromkeys(mat_expand.term_names)) + assert unique_terms == df.columns.tolist() diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index 6e97022c..89b6e7d2 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -7,7 +7,7 @@ import tabmat as tm from tabmat import from_pandas -from tabmat.constructor import _split_sparse_and_dense_parts +from tabmat.constructor_util import _split_sparse_and_dense_parts from tabmat.dense_matrix import DenseMatrix from tabmat.ext.sparse import csr_dense_sandwich from tabmat.split_matrix import SplitMatrix @@ -61,27 +61,32 @@ def split_mat() -> SplitMatrix: return mat -def get_split_with_cat_components() -> ( - list[Union[tm.SparseMatrix, tm.DenseMatrix, tm.CategoricalMatrix]] -): +def get_split_with_cat_components( + missing, +) -> list[Union[tm.SparseMatrix, tm.DenseMatrix, tm.CategoricalMatrix]]: n_rows = 10 np.random.seed(0) dense_1 = tm.DenseMatrix(np.random.random((n_rows, 3))) sparse_1 = tm.SparseMatrix(sps.random(n_rows, 3).tocsc()) - cat = tm.CategoricalMatrix(np.random.choice(range(3), n_rows)) + if missing: + cat = tm.CategoricalMatrix( + np.random.choice([0, 1, 2, None], n_rows), cat_missing_method="zero" + ) + else: + cat = tm.CategoricalMatrix(np.random.choice(range(3), n_rows)) dense_2 = tm.DenseMatrix(np.random.random((n_rows, 3))) sparse_2 = tm.SparseMatrix(sps.random(n_rows, 3, density=0.5).tocsc()) cat_2 = tm.CategoricalMatrix(np.random.choice(range(3), n_rows), drop_first=True) return [dense_1, sparse_1, cat, dense_2, sparse_2, cat_2] -def split_with_cat() -> SplitMatrix: +def split_with_cat(missing) -> SplitMatrix: """Initialized with multiple sparse and dense parts and no indices.""" - return tm.SplitMatrix(get_split_with_cat_components()) + return tm.SplitMatrix(get_split_with_cat_components(missing)) -def split_with_cat_64() -> SplitMatrix: - mat = tm.SplitMatrix(get_split_with_cat_components()) +def split_with_cat_64(missing) -> SplitMatrix: + mat = tm.SplitMatrix(get_split_with_cat_components(missing)) matrices = mat.matrices for i, mat_ in enumerate(mat.matrices): @@ -99,7 +104,15 @@ def split_with_cat_64() -> SplitMatrix: return tm.SplitMatrix(matrices, mat.indices) -@pytest.mark.parametrize("mat", [split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) def test_init(mat: SplitMatrix): assert len(mat.indices) == 4 assert len(mat.matrices) == 4 @@ -109,7 +122,15 @@ def test_init(mat: SplitMatrix): assert mat.matrices[2].shape == (10, 3) -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) def test_init_from_split(mat): np.testing.assert_array_equal(mat.A, tm.SplitMatrix([mat]).A) np.testing.assert_array_equal( @@ -146,7 +167,15 @@ def test_sandwich_sparse_dense(X: np.ndarray, Acols, Bcols): # TODO: ensure cols are in order -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) @pytest.mark.parametrize( "cols", [None, [0], [1, 2, 3], [1, 5]], @@ -160,7 +189,15 @@ def test_sandwich(mat: tm.SplitMatrix, cols): np.testing.assert_allclose(y1, y2, atol=1e-12) -@pytest.mark.parametrize("mat", [split_mat(), split_with_cat(), split_with_cat_64()]) +@pytest.mark.parametrize( + "mat", + [ + split_with_cat(False), + split_with_cat_64(False), + split_with_cat(True), + split_with_cat_64(True), + ], +) @pytest.mark.parametrize("cols", [None, [0], [1, 2, 3], [1, 5]]) def test_split_col_subsets(mat: tm.SplitMatrix, cols): subset_cols_indices, subset_cols, n_cols = mat._split_col_subsets(cols) @@ -189,56 +226,66 @@ def _get_lengths(vec_list: list[Optional[np.ndarray]]): assert (mat.indices[i] == subset_cols_indices[i]).all() -def random_split_matrix(seed=0, n_rows=10, n_cols_per=3): +def random_split_matrix(seed=0, n_rows=10, n_cols_per=3, missing=False): if seed is not None: np.random.seed(seed) dense_1 = tm.DenseMatrix(np.random.random((n_rows, n_cols_per))) sparse = tm.SparseMatrix(sps.random(n_rows, n_cols_per).tocsc()) - cat = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) + if missing: + cat = tm.CategoricalMatrix( + np.random.choice(list(range(n_cols_per)) + [None], n_rows), + cat_missing_method="zero", + ) + else: + cat = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) dense_2 = tm.DenseMatrix(np.random.random((n_rows, n_cols_per))) cat_2 = tm.CategoricalMatrix(np.random.choice(range(n_cols_per), n_rows)) mat = tm.SplitMatrix([dense_1, sparse, cat, dense_2, cat_2]) return mat -def many_random_tests(checker): +def many_random_tests(checker, missing): for i in range(10): mat = random_split_matrix( seed=(1 if i == 0 else None), n_rows=np.random.randint(130), n_cols_per=1 + np.random.randint(10), + missing=missing, ) checker(mat) -def test_sandwich_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_sandwich_many_types(missing): def check(mat): d = np.random.random(mat.shape[0]) res = mat.sandwich(d) expected = (mat.A.T * d[None, :]) @ mat.A np.testing.assert_allclose(res, expected) - many_random_tests(check) + many_random_tests(check, missing) -def test_transpose_matvec_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_transpose_matvec_many_types(missing): def check(mat): d = np.random.random(mat.shape[0]) res = mat.transpose_matvec(d) expected = mat.A.T.dot(d) np.testing.assert_almost_equal(res, expected) - many_random_tests(check) + many_random_tests(check, missing) -def test_matvec_many_types(): +@pytest.mark.parametrize("missing", [False, True], ids=["no_missing", "missing"]) +def test_matvec_many_types(missing): def check(mat): d = np.random.random(mat.shape[1]) res = mat.matvec(d) expected = mat.A.dot(d) np.testing.assert_almost_equal(res, expected) - many_random_tests(check) + many_random_tests(check, missing) def test_init_from_1d(): @@ -259,3 +306,17 @@ def test_matvec(n_rows): ) mat = from_pandas(X, cat_threshold=0) np.testing.assert_allclose(mat.matvec(np.array(mat.shape[1] * [1])), n_cols) + + +@pytest.mark.parametrize("cat_missing_method", ["fail", "zero", "convert"]) +def test_from_pandas_missing(cat_missing_method): + df = pd.DataFrame({"cat": pd.Categorical([1, 2, pd.NA, 1, 2, pd.NA])}) + if cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + from_pandas(df, cat_missing_method=cat_missing_method) + elif cat_missing_method == "zero": + assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 2) + elif cat_missing_method == "convert": + assert from_pandas(df, cat_missing_method=cat_missing_method).shape == (6, 3)