diff --git a/docs/api_reference/_autosummary/xeofs.models.CPCCARotator.rst b/docs/api_reference/_autosummary/xeofs.models.CPCCARotator.rst new file mode 100644 index 0000000..d40d1e5 --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.models.CPCCARotator.rst @@ -0,0 +1,47 @@ +xeofs.models.CPCCARotator +========================= + +.. currentmodule:: xeofs.models + +.. autoclass:: CPCCARotator + :members: + :show-inheritance: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~CPCCARotator.__init__ + ~CPCCARotator.components + ~CPCCARotator.compute + ~CPCCARotator.correlation_coefficients_X + ~CPCCARotator.correlation_coefficients_Y + ~CPCCARotator.cross_correlation_coefficients + ~CPCCARotator.deserialize + ~CPCCARotator.fit + ~CPCCARotator.fraction_variance_X_explained_by_X + ~CPCCARotator.fraction_variance_Y_explained_by_X + ~CPCCARotator.fraction_variance_Y_explained_by_Y + ~CPCCARotator.get_params + ~CPCCARotator.get_serialization_attrs + ~CPCCARotator.heterogeneous_patterns + ~CPCCARotator.homogeneous_patterns + ~CPCCARotator.inverse_transform + ~CPCCARotator.load + ~CPCCARotator.predict + ~CPCCARotator.save + ~CPCCARotator.scores + ~CPCCARotator.serialize + ~CPCCARotator.squared_covariance_fraction + ~CPCCARotator.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCA.rst b/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCA.rst new file mode 100644 index 0000000..59473ed --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCA.rst @@ -0,0 +1,51 @@ +xeofs.models.ComplexCPCCA +========================= + +.. currentmodule:: xeofs.models + +.. autoclass:: ComplexCPCCA + :members: + :show-inheritance: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~ComplexCPCCA.__init__ + ~ComplexCPCCA.components + ~ComplexCPCCA.components_amplitude + ~ComplexCPCCA.components_phase + ~ComplexCPCCA.compute + ~ComplexCPCCA.correlation_coefficients_X + ~ComplexCPCCA.correlation_coefficients_Y + ~ComplexCPCCA.cross_correlation_coefficients + ~ComplexCPCCA.deserialize + ~ComplexCPCCA.fit + ~ComplexCPCCA.fraction_variance_X_explained_by_X + ~ComplexCPCCA.fraction_variance_Y_explained_by_X + ~ComplexCPCCA.fraction_variance_Y_explained_by_Y + ~ComplexCPCCA.get_params + ~ComplexCPCCA.get_serialization_attrs + ~ComplexCPCCA.heterogeneous_patterns + ~ComplexCPCCA.homogeneous_patterns + ~ComplexCPCCA.inverse_transform + ~ComplexCPCCA.load + ~ComplexCPCCA.predict + ~ComplexCPCCA.save + ~ComplexCPCCA.scores + ~ComplexCPCCA.scores_amplitude + ~ComplexCPCCA.scores_phase + ~ComplexCPCCA.serialize + ~ComplexCPCCA.squared_covariance_fraction + ~ComplexCPCCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCARotator.rst b/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCARotator.rst new file mode 100644 index 0000000..bc04ef6 --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.models.ComplexCPCCARotator.rst @@ -0,0 +1,51 @@ +xeofs.models.ComplexCPCCARotator +================================ + +.. currentmodule:: xeofs.models + +.. autoclass:: ComplexCPCCARotator + :members: + :show-inheritance: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. 
autosummary:: + + ~ComplexCPCCARotator.__init__ + ~ComplexCPCCARotator.components + ~ComplexCPCCARotator.components_amplitude + ~ComplexCPCCARotator.components_phase + ~ComplexCPCCARotator.compute + ~ComplexCPCCARotator.correlation_coefficients_X + ~ComplexCPCCARotator.correlation_coefficients_Y + ~ComplexCPCCARotator.cross_correlation_coefficients + ~ComplexCPCCARotator.deserialize + ~ComplexCPCCARotator.fit + ~ComplexCPCCARotator.fraction_variance_X_explained_by_X + ~ComplexCPCCARotator.fraction_variance_Y_explained_by_X + ~ComplexCPCCARotator.fraction_variance_Y_explained_by_Y + ~ComplexCPCCARotator.get_params + ~ComplexCPCCARotator.get_serialization_attrs + ~ComplexCPCCARotator.heterogeneous_patterns + ~ComplexCPCCARotator.homogeneous_patterns + ~ComplexCPCCARotator.inverse_transform + ~ComplexCPCCARotator.load + ~ComplexCPCCARotator.predict + ~ComplexCPCCARotator.save + ~ComplexCPCCARotator.scores + ~ComplexCPCCARotator.scores_amplitude + ~ComplexCPCCARotator.scores_phase + ~ComplexCPCCARotator.serialize + ~ComplexCPCCARotator.squared_covariance_fraction + ~ComplexCPCCARotator.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.models.ComplexMCA.rst b/docs/api_reference/_autosummary/xeofs.models.ComplexMCA.rst index 9d8c7f5..fd12cb0 100644 --- a/docs/api_reference/_autosummary/xeofs.models.ComplexMCA.rst +++ b/docs/api_reference/_autosummary/xeofs.models.ComplexMCA.rst @@ -21,24 +21,28 @@ ~ComplexMCA.components_amplitude ~ComplexMCA.components_phase ~ComplexMCA.compute - ~ComplexMCA.covariance_fraction + ~ComplexMCA.correlation_coefficients_X + ~ComplexMCA.correlation_coefficients_Y + ~ComplexMCA.covariance_fraction_CD95 + ~ComplexMCA.cross_correlation_coefficients ~ComplexMCA.deserialize ~ComplexMCA.fit + ~ComplexMCA.fraction_variance_X_explained_by_X + ~ComplexMCA.fraction_variance_Y_explained_by_X + ~ComplexMCA.fraction_variance_Y_explained_by_Y ~ComplexMCA.get_params ~ComplexMCA.get_serialization_attrs ~ComplexMCA.heterogeneous_patterns ~ComplexMCA.homogeneous_patterns ~ComplexMCA.inverse_transform ~ComplexMCA.load + ~ComplexMCA.predict ~ComplexMCA.save ~ComplexMCA.scores ~ComplexMCA.scores_amplitude ~ComplexMCA.scores_phase ~ComplexMCA.serialize - ~ComplexMCA.singular_values - ~ComplexMCA.squared_covariance ~ComplexMCA.squared_covariance_fraction - ~ComplexMCA.total_covariance ~ComplexMCA.transform diff --git a/docs/api_reference/_autosummary/xeofs.models.ComplexMCARotator.rst b/docs/api_reference/_autosummary/xeofs.models.ComplexMCARotator.rst index 4c64710..81ad13d 100644 --- a/docs/api_reference/_autosummary/xeofs.models.ComplexMCARotator.rst +++ b/docs/api_reference/_autosummary/xeofs.models.ComplexMCARotator.rst @@ -21,24 +21,28 @@ ~ComplexMCARotator.components_amplitude ~ComplexMCARotator.components_phase ~ComplexMCARotator.compute - ~ComplexMCARotator.covariance_fraction + ~ComplexMCARotator.correlation_coefficients_X + ~ComplexMCARotator.correlation_coefficients_Y + ~ComplexMCARotator.covariance_fraction_CD95 + ~ComplexMCARotator.cross_correlation_coefficients ~ComplexMCARotator.deserialize ~ComplexMCARotator.fit + ~ComplexMCARotator.fraction_variance_X_explained_by_X + ~ComplexMCARotator.fraction_variance_Y_explained_by_X + ~ComplexMCARotator.fraction_variance_Y_explained_by_Y ~ComplexMCARotator.get_params ~ComplexMCARotator.get_serialization_attrs ~ComplexMCARotator.heterogeneous_patterns ~ComplexMCARotator.homogeneous_patterns ~ComplexMCARotator.inverse_transform ~ComplexMCARotator.load + ~ComplexMCARotator.predict 
~ComplexMCARotator.save ~ComplexMCARotator.scores ~ComplexMCARotator.scores_amplitude ~ComplexMCARotator.scores_phase ~ComplexMCARotator.serialize - ~ComplexMCARotator.singular_values - ~ComplexMCARotator.squared_covariance ~ComplexMCARotator.squared_covariance_fraction - ~ComplexMCARotator.total_covariance ~ComplexMCARotator.transform diff --git a/docs/api_reference/_autosummary/xeofs.models.ContinuumPowerCCA.rst b/docs/api_reference/_autosummary/xeofs.models.ContinuumPowerCCA.rst new file mode 100644 index 0000000..5939d3d --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.models.ContinuumPowerCCA.rst @@ -0,0 +1,47 @@ +xeofs.models.ContinuumPowerCCA +============================== + +.. currentmodule:: xeofs.models + +.. autoclass:: ContinuumPowerCCA + :members: + :show-inheritance: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~ContinuumPowerCCA.__init__ + ~ContinuumPowerCCA.components + ~ContinuumPowerCCA.compute + ~ContinuumPowerCCA.correlation_coefficients_X + ~ContinuumPowerCCA.correlation_coefficients_Y + ~ContinuumPowerCCA.cross_correlation_coefficients + ~ContinuumPowerCCA.deserialize + ~ContinuumPowerCCA.fit + ~ContinuumPowerCCA.fraction_variance_X_explained_by_X + ~ContinuumPowerCCA.fraction_variance_Y_explained_by_X + ~ContinuumPowerCCA.fraction_variance_Y_explained_by_Y + ~ContinuumPowerCCA.get_params + ~ContinuumPowerCCA.get_serialization_attrs + ~ContinuumPowerCCA.heterogeneous_patterns + ~ContinuumPowerCCA.homogeneous_patterns + ~ContinuumPowerCCA.inverse_transform + ~ContinuumPowerCCA.load + ~ContinuumPowerCCA.predict + ~ContinuumPowerCCA.save + ~ContinuumPowerCCA.scores + ~ContinuumPowerCCA.serialize + ~ContinuumPowerCCA.squared_covariance_fraction + ~ContinuumPowerCCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.models.MCA.rst b/docs/api_reference/_autosummary/xeofs.models.MCA.rst index 16b58d9..6fc243d 100644 --- a/docs/api_reference/_autosummary/xeofs.models.MCA.rst +++ b/docs/api_reference/_autosummary/xeofs.models.MCA.rst @@ -19,22 +19,26 @@ ~MCA.__init__ ~MCA.components ~MCA.compute - ~MCA.covariance_fraction + ~MCA.correlation_coefficients_X + ~MCA.correlation_coefficients_Y + ~MCA.covariance_fraction_CD95 + ~MCA.cross_correlation_coefficients ~MCA.deserialize ~MCA.fit + ~MCA.fraction_variance_X_explained_by_X + ~MCA.fraction_variance_Y_explained_by_X + ~MCA.fraction_variance_Y_explained_by_Y ~MCA.get_params ~MCA.get_serialization_attrs ~MCA.heterogeneous_patterns ~MCA.homogeneous_patterns ~MCA.inverse_transform ~MCA.load + ~MCA.predict ~MCA.save ~MCA.scores ~MCA.serialize - ~MCA.singular_values - ~MCA.squared_covariance ~MCA.squared_covariance_fraction - ~MCA.total_covariance ~MCA.transform diff --git a/docs/api_reference/_autosummary/xeofs.models.MCARotator.rst b/docs/api_reference/_autosummary/xeofs.models.MCARotator.rst index 785836d..ed88ae2 100644 --- a/docs/api_reference/_autosummary/xeofs.models.MCARotator.rst +++ b/docs/api_reference/_autosummary/xeofs.models.MCARotator.rst @@ -19,22 +19,26 @@ ~MCARotator.__init__ ~MCARotator.components ~MCARotator.compute - ~MCARotator.covariance_fraction + ~MCARotator.correlation_coefficients_X + ~MCARotator.correlation_coefficients_Y + ~MCARotator.covariance_fraction_CD95 + ~MCARotator.cross_correlation_coefficients ~MCARotator.deserialize ~MCARotator.fit + ~MCARotator.fraction_variance_X_explained_by_X + ~MCARotator.fraction_variance_Y_explained_by_X + ~MCARotator.fraction_variance_Y_explained_by_Y 
~MCARotator.get_params ~MCARotator.get_serialization_attrs ~MCARotator.heterogeneous_patterns ~MCARotator.homogeneous_patterns ~MCARotator.inverse_transform ~MCARotator.load + ~MCARotator.predict ~MCARotator.save ~MCARotator.scores ~MCARotator.serialize - ~MCARotator.singular_values - ~MCARotator.squared_covariance ~MCARotator.squared_covariance_fraction - ~MCARotator.total_covariance ~MCARotator.transform diff --git a/docs/api_reference/multi_set_analysis.rst b/docs/api_reference/multi_set_analysis.rst index 3cef399..8086537 100644 --- a/docs/api_reference/multi_set_analysis.rst +++ b/docs/api_reference/multi_set_analysis.rst @@ -9,9 +9,12 @@ Methods that investigate relationships or patterns between variables across two :recursive: xeofs.models.MCA + xeofs.models.CCA + xeofs.models.CPCCA xeofs.models.ComplexMCA + xeofs.models.ComplexCPCCA xeofs.models.HilbertMCA - xeofs.models.CCA + xeofs.models.HilbertCPCCA ------------------------------ Sparse Solutions via Rotation @@ -23,5 +26,8 @@ Sparse Solutions via Rotation :recursive: xeofs.models.MCARotator + xeofs.models.CPCCARotator xeofs.models.ComplexMCARotator + xeofs.models.ComplexCPCCARotator xeofs.models.HilbertMCARotator + xeofs.models.HilbertCPCCARotator diff --git a/tests/models/test_cpcca.py b/tests/models/test_cpcca.py new file mode 100644 index 0000000..5ebfe3f --- /dev/null +++ b/tests/models/test_cpcca.py @@ -0,0 +1,274 @@ +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xeofs.models.cpcca import CPCCA + + +def generate_random_data(shape, lazy=False, seed=142): + rng = np.random.default_rng(seed) + if lazy: + return xr.DataArray( + da.random.random(shape, chunks=(5, 5)), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + else: + return xr.DataArray( + rng.random(shape), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + + +def generate_well_conditioned_data(lazy=False): + rng = np.random.default_rng(142) + t = np.linspace(0, 50, 200) + std = 0.1 + x1 = np.sin(t)[:, None] + rng.normal(0, std, size=(200, 2)) + x2 = np.sin(t)[:, None] + rng.normal(0, std, size=(200, 3)) + x1[:, 1] = x1[:, 1] ** 2 + x2[:, 1] = x2[:, 1] ** 3 + x2[:, 2] = abs(x2[:, 2]) ** (0.5) + coords_time = np.arange(len(t)) + coords_fx = [1, 2] + coords_fy = [1, 2, 3] + X = xr.DataArray( + x1, + dims=["sample", "feature"], + coords={"sample": coords_time, "feature": coords_fx}, + ) + Y = xr.DataArray( + x2, + dims=["sample", "feature"], + coords={"sample": coords_time, "feature": coords_fy}, + ) + if lazy: + X = X.chunk({"sample": 5, "feature": -1}) + Y = Y.chunk({"sample": 5, "feature": -1}) + return X, Y + else: + return X, Y + + +@pytest.fixture +def cpcca(): + return CPCCA(n_modes=1) + + +def test_initialization(): + model = CPCCA() + assert model is not None + + +def test_fit(cpcca): + X, Y = generate_well_conditioned_data() + cpcca.fit(X, Y, dim="sample") + assert hasattr(cpcca, "preprocessor1") + assert hasattr(cpcca, "preprocessor2") + assert hasattr(cpcca, "data") + + +def test_fit_empty_data(cpcca): + with pytest.raises(ValueError): + cpcca.fit(xr.DataArray(), xr.DataArray(), "time") + + +def test_fit_invalid_dims(cpcca): + X, Y = generate_well_conditioned_data() + with pytest.raises(ValueError): + cpcca.fit(X, Y, dim=("invalid_dim1", "invalid_dim2")) + + +def test_transform(cpcca): + X, Y = generate_well_conditioned_data() + cpcca.fit(X, Y, dim="sample") + result = cpcca.transform(X, Y) + assert 
isinstance(result, list) + assert isinstance(result[0], xr.DataArray) + + +def test_transform_unseen_data(cpcca): + X, Y = generate_well_conditioned_data() + x = X.isel(sample=slice(151, 200)) + y = Y.isel(sample=slice(151, 200)) + X = X.isel(sample=slice(None, 150)) + Y = Y.isel(sample=slice(None, 150)) + + cpcca.fit(X, Y, "sample") + result = cpcca.transform(x, y) + assert isinstance(result, list) + assert isinstance(result[0], xr.DataArray) + # Check that unseen data can be transformed + assert result[0].notnull().all() + assert result[1].notnull().all() + + +def test_inverse_transform(cpcca): + X, Y = generate_well_conditioned_data() + cpcca.fit(X, Y, "sample") + # Assuming mode as 1 for simplicity + scores1 = cpcca.data["scores1"].sel(mode=1) + scores2 = cpcca.data["scores2"].sel(mode=1) + Xrec1, Xrec2 = cpcca.inverse_transform(scores1, scores2) + assert isinstance(Xrec1, xr.DataArray) + assert isinstance(Xrec2, xr.DataArray) + + +@pytest.mark.parametrize( + "alpha,use_pca", + [ + (1.0, False), + (0.5, False), + (0.0, False), + (1.0, True), + (0.5, True), + (0.0, True), + ], +) +def test_squared_covariance_fraction(alpha, use_pca): + X, Y = generate_well_conditioned_data() + cpcca = CPCCA(n_modes=2, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + scf = cpcca.squared_covariance_fraction() + assert isinstance(scf, xr.DataArray) + assert all(scf <= 1), "Squared covariance fraction is greater than 1" + + +@pytest.mark.parametrize( + "alpha,use_pca", + [ + (1.0, False), + (0.5, False), + (0.0, False), + (1.0, True), + (0.5, True), + (0.0, True), + ], +) +def test_total_squared_covariance(alpha, use_pca): + X, Y = generate_well_conditioned_data() + + # Compute total squared covariance + X_ = X.rename({"feature": "x"}) + Y_ = Y.rename({"feature": "y"}) + cov_mat = xr.cov(X_, Y_, dim="sample") + tsc = (cov_mat**2).sum() + + cpcca = CPCCA(n_modes=2, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + tsc_model = cpcca.data["total_squared_covariance"] + xr.testing.assert_allclose(tsc, tsc_model) + + +def test_alpha_integer(): + X, Y = generate_well_conditioned_data() + + cpcca = CPCCA(n_modes=2, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + +def test_fit_different_coordinates(): + """Like a lagged CCA scenario""" + X, Y = generate_well_conditioned_data() + X = X.isel(sample=slice(0, 99)) + Y = Y.isel(sample=slice(100, 199)) + cpcca = CPCCA(n_modes=2, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + r = cpcca.cross_correlation_coefficients() + # Correlation coefficients are not zero + assert np.all(r > np.finfo(r.dtype).eps) + + +@pytest.mark.parametrize( + "dim", + [ + (("time",)), + (("lat", "lon")), + (("lon", "lat")), + ], +) +def test_components(mock_data_array, dim): + cpcca = CPCCA(n_modes=2, alpha=1, use_pca=False) + cpcca.fit(mock_data_array, mock_data_array, dim) + components1, components2 = cpcca.components() + feature_dims = tuple(set(mock_data_array.dims) - set(dim)) + assert isinstance(components1, xr.DataArray) + assert isinstance(components2, xr.DataArray) + assert set(components1.dims) == set( + ("mode",) + feature_dims + ), "Components1 does not have the right feature dimensions" + assert set(components2.dims) == set( + ("mode",) + feature_dims + ), "Components2 does not have the right feature dimensions" + + +@pytest.mark.parametrize( + "shapeX,shapeY,alpha,use_pca", + [ + ((20, 10), (20, 10), 1.0, False), + ((20, 40), (20, 30), 1.0, False), + ((20, 10), (20, 40), 1.0, False), + ((20, 10), (20, 10), 0.5,
False), + ((20, 40), (20, 30), 0.5, False), + ((20, 10), (20, 40), 0.5, False), + ((20, 10), (20, 10), 1.0, True), + ((20, 40), (20, 30), 1.0, True), + ((20, 10), (20, 40), 1.0, True), + ((20, 10), (20, 10), 0.5, True), + ((20, 40), (20, 30), 0.5, True), + ((20, 10), (20, 40), 0.5, True), + ], +) +def test_components_coordinates(shapeX, shapeY, alpha, use_pca): + # Test that the components have the right coordinates + X = generate_random_data(shapeX) + Y = generate_random_data(shapeY) + + cpcca = CPCCA(n_modes=2, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + components1, components2 = cpcca.components() + xr.testing.assert_equal(components1.coords["feature"], X.coords["feature"]) + xr.testing.assert_equal(components2.coords["feature"], Y.coords["feature"]) + + +@pytest.mark.parametrize( + "correction", + [(None), ("fdr_bh")], +) +def test_homogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + _ = cpcca.homogeneous_patterns(correction=correction) + + +@pytest.mark.parametrize( + "correction", + [(None), ("fdr_bh")], +) +def test_heterogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + _ = cpcca.heterogeneous_patterns(correction=correction) + + +def test_predict(): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=0.2, use_pca=False) + cpcca.fit(X, Y, "sample") + + Xnew = generate_random_data((200, 10), seed=123) + + Ry_pred = cpcca.predict(Xnew) + _ = cpcca.inverse_transform(Y=Ry_pred) diff --git a/tests/models/test_cpcca_complex_rotator.py b/tests/models/test_cpcca_complex_rotator.py new file mode 100644 index 0000000..088e330 --- /dev/null +++ b/tests/models/test_cpcca_complex_rotator.py @@ -0,0 +1,67 @@ +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xeofs.models import HilbertCPCCA, HilbertCPCCARotator + + +def generate_random_data(shape, lazy=False, seed=142): + rng = np.random.default_rng(seed) + if lazy: + return xr.DataArray( + da.random.random(shape, chunks=(5, 5)), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + else: + return xr.DataArray( + rng.random(shape), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + + +def test_transform_raises(): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = HilbertCPCCA(n_modes=10, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + rotator = HilbertCPCCARotator(n_modes=4) + rotator.fit(cpcca) + + with pytest.raises(NotImplementedError): + rotator.transform() + + +@pytest.mark.parametrize( + "alpha,use_pca", + [ + (1.0, False), + (0.5, False), + (0.0, False), + (1.0, True), + (0.5, True), + (0.0, True), + ], +) +def test_squared_covariance_fraction_conserved(alpha, use_pca): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = HilbertCPCCA(n_modes=10, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + + n_rot_modes = 5 + rotator = HilbertCPCCARotator(n_modes=n_rot_modes, power=1) + rotator.fit(cpcca) + + scf = 
rotator.squared_covariance_fraction() + scf_rot = rotator.squared_covariance_fraction() + + scf_sum = scf.sel(mode=slice(1, n_rot_modes)).sum() + scf_rot_sum = scf_rot.sel(mode=slice(1, n_rot_modes)).sum() + + xr.testing.assert_allclose(scf_sum, scf_rot_sum) diff --git a/tests/models/test_cpcca_rotator.py b/tests/models/test_cpcca_rotator.py new file mode 100644 index 0000000..ce1c7c4 --- /dev/null +++ b/tests/models/test_cpcca_rotator.py @@ -0,0 +1,112 @@ +import dask.array.random as da +import numpy as np +import pytest +import xarray as xr + +from xeofs.models import CPCCA, CPCCARotator + + +def generate_random_data(shape, lazy=False, seed=142): + rng = np.random.default_rng(seed) + if lazy: + return xr.DataArray( + da.random(shape, chunks=(5, 5)), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + else: + return xr.DataArray( + rng.random(shape), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + + +@pytest.mark.parametrize( + "correction", + [(None), ("fdr_bh")], +) +def test_homogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + rotator = CPCCARotator(n_modes=4) + rotator.fit(cpcca) + + _ = rotator.homogeneous_patterns(correction=correction) + + +@pytest.mark.parametrize( + "correction", + [(None), ("fdr_bh")], +) +def test_heterogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=1, use_pca=False) + cpcca.fit(X, Y, "sample") + + rotator = CPCCARotator(n_modes=4) + rotator.fit(cpcca) + + _ = rotator.heterogeneous_patterns(correction=correction) + + +@pytest.mark.parametrize( + "alpha,use_pca", + [ + (1.0, False), + (0.5, False), + (0.0, False), + (1.0, True), + (0.5, True), + (0.0, True), + ], +) +def test_squared_covariance_fraction(alpha, use_pca): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + rotator = CPCCARotator(n_modes=10) + rotator.fit(cpcca) + + scf = rotator.squared_covariance_fraction() + assert isinstance(scf, xr.DataArray) + assert all(scf <= 1), "Squared covariance fraction is greater than 1" + + +@pytest.mark.parametrize( + "alpha,use_pca", + [ + (1.0, False), + (0.5, False), + (0.0, False), + (1.0, True), + (0.5, True), + (0.0, True), + ], +) +def test_squared_covariance_fraction_conserved(alpha, use_pca): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cpcca = CPCCA(n_modes=10, alpha=alpha, use_pca=use_pca, n_pca_modes="all") + cpcca.fit(X, Y, "sample") + + n_rot_modes = 5 + rotator = CPCCARotator(n_modes=n_rot_modes, power=1) + rotator.fit(cpcca) + + scf = rotator.squared_covariance_fraction() + scf_rot = rotator.squared_covariance_fraction() + + scf_sum = scf.sel(mode=slice(1, n_rot_modes)).sum() + scf_rot_sum = scf_rot.sel(mode=slice(1, n_rot_modes)).sum() + + xr.testing.assert_allclose(scf_sum, scf_rot_sum) diff --git a/tests/models/test_hilbert_mca.py b/tests/models/test_hilbert_mca.py index 3245116..d0cf9bc 100644 --- a/tests/models/test_hilbert_mca.py +++ b/tests/models/test_hilbert_mca.py @@ -29,21 +29,6 @@ def test_fit(mca_model, mock_data_array, dim): assert 
hasattr(mca_model, "data") -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_squared_covariance(mca_model, mock_data_array, dim): - mca_model.fit(mock_data_array, mock_data_array, dim) - squared_covariance = mca_model.squared_covariance() - assert isinstance(squared_covariance, xr.DataArray) - assert (squared_covariance > 0).all() - - @pytest.mark.parametrize( "dim", [ @@ -57,23 +42,9 @@ def test_squared_covariance_fraction(mca_model, mock_data_array, dim): squared_covariance_fraction = mca_model.squared_covariance_fraction() assert isinstance(squared_covariance_fraction, xr.DataArray) assert (squared_covariance_fraction > 0).all() - assert squared_covariance_fraction.sum("mode") <= 1 - - -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_singular_values(mca_model, mock_data_array, dim): - mca_model.fit(mock_data_array, mock_data_array, dim) - n_modes = mca_model.get_params()["n_modes"] - svals = mca_model.singular_values() - assert isinstance(svals, xr.DataArray) - assert svals.size == n_modes + assert all( + squared_covariance_fraction <= 1 + ), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( @@ -86,9 +57,9 @@ def test_singular_values(mca_model, mock_data_array, dim): ) def test_covariance_fraction(mca_model, mock_data_array, dim): mca_model.fit(mock_data_array, mock_data_array, dim) - cf = mca_model.covariance_fraction() + cf = mca_model.covariance_fraction_CD95() assert isinstance(cf, xr.DataArray) - assert cf.sum("mode") <= 1.00001, "Covariance fraction is greater than 1" + assert all(cf <= 1), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( @@ -231,18 +202,6 @@ def test_transform_not_implemented(mca_model, mock_data_array, dim): mca_model.transform(mock_data_array, mock_data_array) -def test_homogeneous_patterns_not_implemented(): - mca = HilbertMCA() - with pytest.raises(NotImplementedError): - mca.homogeneous_patterns() - - -def test_heterogeneous_patterns_not_implemented(): - mca = HilbertMCA() - with pytest.raises(NotImplementedError): - mca.heterogeneous_patterns() - - @pytest.mark.parametrize( "dim", [ diff --git a/tests/models/test_hilbert_mca_rotator.py b/tests/models/test_hilbert_mca_rotator.py index 6da518d..3b7d066 100644 --- a/tests/models/test_hilbert_mca_rotator.py +++ b/tests/models/test_hilbert_mca_rotator.py @@ -26,7 +26,6 @@ def test_init(): assert mca_rotator._params["power"] == 1 assert mca_rotator._params["max_iter"] == 1000 assert mca_rotator._params["rtol"] == 1e-8 - assert mca_rotator._params["squared_loadings"] is False @pytest.mark.parametrize( @@ -58,7 +57,7 @@ def test_transform(mca_model, mock_data_array): mca_rotator.fit(mca_model) with pytest.raises(NotImplementedError): - mca_rotator.transform(data1=mock_data_array, data2=mock_data_array) + mca_rotator.transform(X=mock_data_array, Y=mock_data_array) @pytest.mark.parametrize( @@ -78,24 +77,10 @@ def test_inverse_transform(mca_model): reconstructed_data = mca_rotator.inverse_transform(scores1, scores2) - assert isinstance(reconstructed_data, tuple) + assert isinstance(reconstructed_data, list) assert len(reconstructed_data) == 2 -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_squared_covariance(mca_model, mock_data_array, dim): - mca_model.fit(mock_data_array, mock_data_array, dim) - squared_covariance = mca_model.squared_covariance() - assert 
isinstance(squared_covariance, xr.DataArray) - - @pytest.mark.parametrize( "dim", [ @@ -110,23 +95,6 @@ def test_squared_covariance_fraction(mca_model, mock_data_array, dim): assert isinstance(squared_covariance_fraction, xr.DataArray) -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_singular_values(mca_model): - mca_rotator = HilbertMCARotator(n_modes=4) - mca_rotator.fit(mca_model) - n_modes = mca_rotator.get_params()["n_modes"] - svals = mca_rotator.singular_values() - assert isinstance(svals, xr.DataArray) - assert svals.size == n_modes - - @pytest.mark.parametrize( "dim", [ @@ -138,9 +106,9 @@ def test_singular_values(mca_model): def test_covariance_fraction(mca_model): mca_rotator = HilbertMCARotator(n_modes=4) mca_rotator.fit(mca_model) - cf = mca_rotator.covariance_fraction() + cf = mca_rotator.covariance_fraction_CD95() assert isinstance(cf, xr.DataArray) - assert cf.sum("mode") <= 1.00001, "Covariance fraction is greater than 1" + assert all(cf <= 1), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( @@ -192,8 +160,7 @@ def test_scores(mca_model, mock_data_array, dim): def test_homogeneous_patterns(mca_model, mock_data_array, dim): mca_rotator = HilbertMCARotator(n_modes=2) mca_rotator.fit(mca_model) - with pytest.raises(NotImplementedError): - _ = mca_rotator.homogeneous_patterns() + mca_rotator.homogeneous_patterns() @pytest.mark.parametrize( @@ -207,8 +174,7 @@ def test_homogeneous_patterns(mca_model, mock_data_array, dim): def test_heterogeneous_patterns(mca_model, mock_data_array, dim): mca_rotator = HilbertMCARotator(n_modes=2) mca_rotator.fit(mca_model) - with pytest.raises(NotImplementedError): - _ = mca_rotator.heterogeneous_patterns() + mca_rotator.heterogeneous_patterns() @pytest.mark.parametrize( diff --git a/tests/models/test_mca.py b/tests/models/test_mca.py index 09aba4d..6216ced 100644 --- a/tests/models/test_mca.py +++ b/tests/models/test_mca.py @@ -1,8 +1,9 @@ -import pytest import numpy as np +import pytest import xarray as xr from xeofs.models.mca import MCA + from ..utilities import data_is_dask @@ -99,7 +100,7 @@ def test_fit_with_dataarray_list(mca_model, mock_data_array_list, dim): ) def test_transform(mca_model, mock_data_array, dim): mca_model.fit(mock_data_array, mock_data_array, dim) - result = mca_model.transform(data1=mock_data_array, data2=mock_data_array) + result = mca_model.transform(X=mock_data_array, Y=mock_data_array) assert isinstance(result, list) assert isinstance(result[0], xr.DataArray) @@ -110,7 +111,7 @@ def test_transform_unseen_data(mca_model, mock_data_array, dim): data_unseen = mock_data_array.isel(time=slice(21, None)) mca_model.fit(data, data, dim) - result = mca_model.transform(data1=data_unseen, data2=data_unseen) + result = mca_model.transform(X=data_unseen, Y=data_unseen) assert isinstance(result, list) assert isinstance(result[0], xr.DataArray) # Check that unseen data can be transformed @@ -136,20 +137,6 @@ def test_inverse_transform(mca_model, mock_data_array, dim): assert isinstance(Xrec2, xr.DataArray) -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_squared_covariance(mca_model, mock_data_array, dim): - mca_model.fit(mock_data_array, mock_data_array, dim) - squared_covariance = mca_model.squared_covariance() - assert isinstance(squared_covariance, xr.DataArray) - - @pytest.mark.parametrize( "dim", [ @@ -162,23 +149,7 @@ def test_squared_covariance_fraction(mca_model, 
mock_data_array, dim): mca_model.fit(mock_data_array, mock_data_array, dim) scf = mca_model.squared_covariance_fraction() assert isinstance(scf, xr.DataArray) - assert scf.sum("mode") <= 1.00001, "Squared covariance fraction is greater than 1" - - -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_singular_values(mca_model, mock_data_array, dim): - mca_model.fit(mock_data_array, mock_data_array, dim) - n_modes = mca_model.get_params()["n_modes"] - svals = mca_model.singular_values() - assert isinstance(svals, xr.DataArray) - assert svals.size == n_modes + assert all(scf <= 1), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( @@ -191,9 +162,9 @@ def test_singular_values(mca_model, mock_data_array, dim): ) def test_covariance_fraction(mca_model, mock_data_array, dim): mca_model.fit(mock_data_array, mock_data_array, dim) - cf = mca_model.covariance_fraction() + cf = mca_model.covariance_fraction_CD95() assert isinstance(cf, xr.DataArray) - assert cf.sum("mode") <= 1.00001, "Covariance fraction is greater than 1" + assert all(cf <= 1), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( @@ -383,16 +354,16 @@ def test_heterogeneous_patterns(mca_model, mock_data_array, dim): ], ) def test_compute(mock_dask_data_array, dim, compute): - mca_model = MCA(n_modes=10, compute=compute) + mca_model = MCA(n_modes=10, compute=compute, n_pca_modes=10) mca_model.fit(mock_dask_data_array, mock_dask_data_array, (dim)) if compute: - assert not data_is_dask(mca_model.data["squared_covariance"]) + assert not data_is_dask(mca_model.data["singular_values"]) assert not data_is_dask(mca_model.data["components1"]) assert not data_is_dask(mca_model.data["components2"]) else: - assert data_is_dask(mca_model.data["squared_covariance"]) + assert data_is_dask(mca_model.data["singular_values"]) assert data_is_dask(mca_model.data["components1"]) assert data_is_dask(mca_model.data["components2"]) diff --git a/tests/models/test_mca_rotator.py b/tests/models/test_mca_rotator.py index 6119854..f53db1b 100644 --- a/tests/models/test_mca_rotator.py +++ b/tests/models/test_mca_rotator.py @@ -1,22 +1,23 @@ -import pytest import numpy as np +import pytest import xarray as xr # Import the classes from your modules from xeofs.models import MCA, MCARotator + from ..utilities import data_is_dask @pytest.fixture def mca_model(mock_data_array, dim): - mca = MCA(n_modes=5) + mca = MCA(n_modes=5, use_pca=False) mca.fit(mock_data_array, mock_data_array, dim) return mca @pytest.fixture def mca_model_delayed(mock_dask_data_array, dim): - mca = MCA(n_modes=5, compute=False, check_nans=False) + mca = MCA(n_modes=5, compute=False, check_nans=False, use_pca=False) mca.fit(mock_dask_data_array, mock_dask_data_array, dim) return mca @@ -27,7 +28,7 @@ def test_init(): assert mca_rotator._params["power"] == 1 assert mca_rotator._params["max_iter"] == 1000 assert mca_rotator._params["rtol"] == 1e-8 - assert mca_rotator._params["squared_loadings"] is False + # assert mca_rotator._params["squared_loadings"] is False @pytest.mark.parametrize( @@ -58,7 +59,7 @@ def test_transform(mca_model, mock_data_array): mca_rotator = MCARotator(n_modes=4) mca_rotator.fit(mca_model) - projections = mca_rotator.transform(data1=mock_data_array, data2=mock_data_array) + projections = mca_rotator.transform(X=mock_data_array, Y=mock_data_array) assert len(projections) == 2 @@ -80,25 +81,10 @@ def test_inverse_transform(mca_model): reconstructed_data = 
mca_rotator.inverse_transform(scores1, scores2) - assert isinstance(reconstructed_data, tuple) + assert isinstance(reconstructed_data, list) assert len(reconstructed_data) == 2 -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_squared_covariance(mca_model, mock_data_array, dim): - mca_rotator = MCARotator(n_modes=4) - mca_rotator.fit(mca_model) - covariance_fraction = mca_rotator.squared_covariance() - assert isinstance(covariance_fraction, xr.DataArray) - - @pytest.mark.parametrize( "dim", [ @@ -114,23 +100,6 @@ def test_squared_covariance_fraction(mca_model, mock_data_array, dim): assert isinstance(squared_covariance_fraction, xr.DataArray) -@pytest.mark.parametrize( - "dim", - [ - (("time",)), - (("lat", "lon")), - (("lon", "lat")), - ], -) -def test_singular_values(mca_model): - mca_rotator = MCARotator(n_modes=4) - mca_rotator.fit(mca_model) - n_modes = mca_rotator.get_params()["n_modes"] - svals = mca_rotator.singular_values() - assert isinstance(svals, xr.DataArray) - assert svals.size == n_modes - - @pytest.mark.parametrize( "dim", [ @@ -142,9 +111,9 @@ def test_singular_values(mca_model): def test_covariance_fraction(mca_model): mca_rotator = MCARotator(n_modes=4) mca_rotator.fit(mca_model) - cf = mca_rotator.covariance_fraction() + cf = mca_rotator.covariance_fraction_CD95() assert isinstance(cf, xr.DataArray) - assert cf.sum("mode") <= 1.00001, "Covariance fraction is greater than 1" + assert all(cf <= 1), "Squared covariance fraction is greater than 1" @pytest.mark.parametrize( diff --git a/tests/models/test_orthogonality.py b/tests/models/test_orthogonality.py index cf7a911..dad46f7 100644 --- a/tests/models/test_orthogonality.py +++ b/tests/models/test_orthogonality.py @@ -13,6 +13,12 @@ ) +def is_diagonal(matrix, tol=1e-10): + # Check if all off-diagonal elements are close to zero within the specified tolerance + off_diagonal_elements = matrix - np.diag(np.diag(matrix)) + return np.all(np.abs(off_diagonal_elements) < tol) + + # Orthogonality # ============================================================================= # EOF @@ -242,8 +248,9 @@ def test_mca_scores(dim, use_coslat, mock_data_array): model.fit(data1, data2, dim=dim) U1 = model.data["scores1"].values U2 = model.data["scores2"].values - K = U1.T @ U2 - target = np.eye(K.shape[0]) * (model.data["input_data1"].sample.size - 1) + s = model.data["singular_values"].values + K = U1.T @ U2 / (U1.shape[0] - 1) + target = np.diag(s) assert np.allclose(K, target, atol=1e-5), "Scores are not orthogonal" @@ -286,40 +293,41 @@ def test_cmca_scores(dim, use_coslat, mock_data_array): """Scores are unitary""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = HilbertMCA(n_modes=10, standardize=True, use_coslat=use_coslat) + model = HilbertMCA(n_modes=6, standardize=True, use_coslat=use_coslat) model.fit(data1, data2, dim=dim) U1 = model.data["scores1"].values U2 = model.data["scores2"].values - K = U1.conj().T @ U2 - target = np.eye(K.shape[0]) * (model.data["input_data1"].sample.size - 1) + s = model.data["singular_values"].values + K = U1.conj().T @ U2 / (U1.shape[0] - 1) + target = np.diag(s) assert np.allclose(K, target, atol=1e-5), "Scores are not unitary" # Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - 
(("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_rmca_components(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_rmca_components(dim, use_coslat, power, mock_data_array): """Components are NOT orthogonal""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = MCA(n_modes=19, standardize=True, use_coslat=use_coslat) + model = MCA( + n_modes=19, + standardize=True, + use_coslat=use_coslat, + use_pca=True, + n_pca_modes=19, + ) model.fit(data1, data2, dim=dim) - rot = MCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + rot = MCARotator(n_modes=5, power=power) rot.fit(model) V1 = rot.data["components1"].values V2 = rot.data["components2"].values @@ -337,66 +345,64 @@ def test_rmca_components(dim, use_coslat, power, squared_loadings, mock_data_arr @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_rmca_scores(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_rmca_scores(dim, use_coslat, power, mock_data_array): """Components are orthogonal only for Varimax rotation""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = MCA(n_modes=5, standardize=True, use_coslat=use_coslat) + model = MCA( + n_modes=5, + standardize=True, + use_coslat=use_coslat, + use_pca=False, + n_pca_modes=19, + ) model.fit(data1, data2, dim=dim) - rot = MCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + rot = MCARotator(n_modes=5, power=power) rot.fit(model) + W1 = rot.data["norm1"].values + W2 = rot.data["norm2"].values U1 = rot.data["scores1"].values U2 = rot.data["scores2"].values - K = U1.conj().T @ U2 - target = np.eye(K.shape[0]) * (model.data["input_data1"].sample.size - 1) + K = U1.T @ U2 / (U1.shape[0] - 1) + target = np.diag(W1 * W2) if power == 1: # Varimax rotation does guarantee orthogonality - assert np.allclose(K, target, atol=1e-5), "Components are not orthogonal" + np.testing.assert_allclose(K, target, atol=1e-5, rtol=1e-5) else: - assert not np.allclose(K, target), "Components are orthogonal" + assert not is_diagonal(K), "Components are orthogonal" # Hilbert Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 
2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_crmca_components(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_crmca_components(dim, use_coslat, power, mock_data_array): """Components are NOT orthogonal""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = HilbertMCA(n_modes=19, standardize=True, use_coslat=use_coslat) + model = HilbertMCA( + n_modes=19, standardize=True, use_coslat=use_coslat, use_pca=False + ) model.fit(data1, data2, dim=dim) - rot = HilbertMCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + rot = HilbertMCARotator(n_modes=5, power=power) rot.fit(model) V1 = rot.data["components1"].values V2 = rot.data["components2"].values @@ -414,37 +420,35 @@ def test_crmca_components(dim, use_coslat, power, squared_loadings, mock_data_ar @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_crmca_scores(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_crmca_scores(dim, use_coslat, power, mock_data_array): """Components are orthogonal only for Varimax rotation""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = HilbertMCA(n_modes=5, standardize=True, use_coslat=use_coslat) + model = HilbertMCA( + n_modes=5, standardize=True, use_coslat=use_coslat, use_pca=False + ) model.fit(data1, data2, dim=dim) - rot = HilbertMCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + rot = HilbertMCARotator(n_modes=5, power=power) rot.fit(model) + W1 = rot.data["norm1"].values + W2 = rot.data["norm2"].values U1 = rot.data["scores1"].values U2 = rot.data["scores2"].values - K = U1.conj().T @ U2 - target = np.eye(K.shape[0]) * (model.data["input_data1"].sample.size - 1) + K = U1.conj().T @ U2 / (U1.shape[0] - 1) + target = np.diag(W1 * W2) if power == 1: # Varimax rotation does guarantee orthogonality - assert np.allclose(K, target, atol=1e-5), "Components are not orthogonal" + np.testing.assert_allclose(K, target, atol=1e-5, rtol=1e-5) else: assert not np.allclose(K, target), "Components are orthogonal" @@ -565,7 +569,7 @@ def test_mca_transform(dim, use_coslat, mock_data_array): model = MCA(n_modes=5, standardize=True, use_coslat=use_coslat) model.fit(data1, data2, dim=dim) scores1, scores2 = model.scores() - pseudo_scores1, pseudo_scores2 = model.transform(data1=data1, data2=data2) + pseudo_scores1, pseudo_scores2 = model.transform(X=data1, Y=data2) assert np.allclose( scores1, pseudo_scores1, atol=1e-4 ), "Transformed data does not match the scores" @@ -591,37 +595,31 @@ def test_cmca_transform(dim, use_coslat, mock_data_array): model.fit(data1, data2, dim=dim) scores1, scores2 = model.scores() with pytest.raises(NotImplementedError): - 
pseudo_scores1, pseudo_scores2 = model.transform(data1=data1, data2=data2) + pseudo_scores1, pseudo_scores2 = model.transform(X=data1, Y=data2) # Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_rmca_transform(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_rmca_transform(dim, use_coslat, power, mock_data_array): """Transforming the original data results in the model scores""" - data1 = mock_data_array.copy() - data2 = data1.copy() ** 2 + X = mock_data_array.copy() + Y = X.copy() ** 2 model = MCA(n_modes=5, standardize=True, use_coslat=use_coslat) - model.fit(data1, data2, dim=dim) - rot = MCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + model.fit(X, Y, dim=dim) + rot = MCARotator(n_modes=5, power=power) rot.fit(model) scores1, scores2 = rot.scores() - pseudo_scores1, pseudo_scores2 = rot.transform(data1=data1, data2=data2) + pseudo_scores1, pseudo_scores2 = rot.transform(X=X, Y=Y) assert np.allclose( scores1, pseudo_scores1, atol=1e-5 ), "Transformed data does not match the scores" @@ -632,33 +630,27 @@ def test_rmca_transform(dim, use_coslat, power, squared_loadings, mock_data_arra # Hilbert Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", + "dim, use_coslat, power", [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), ], ) -def test_crmca_transform(dim, use_coslat, power, squared_loadings, mock_data_array): +def test_crmca_transform(dim, use_coslat, power, mock_data_array): """Transforming the original data results in the model scores""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 model = HilbertMCA(n_modes=5, standardize=True, use_coslat=use_coslat) model.fit(data1, data2, dim=dim) - rot = HilbertMCARotator(n_modes=5, power=power, squared_loadings=squared_loadings) + rot = HilbertMCARotator(n_modes=5, power=power) rot.fit(model) scores1, scores2 = rot.scores() with pytest.raises(NotImplementedError): - pseudo_scores1, pseudo_scores2 = rot.transform(data1=data1, data2=data2) + pseudo_scores1, pseudo_scores2 = rot.transform(X=data1, Y=data2) # Reconstruct @@ -806,7 +798,7 @@ def test_mca_inverse_transform(dim, use_coslat, mock_data_array): """Inverse transform produces an approximate reconstruction of the original data""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = MCA(n_modes=19, standardize=True, 
use_coslat=use_coslat) + model = MCA(n_modes=19, standardize=True, use_coslat=use_coslat, n_pca_modes="all") model.fit(data1, data2, dim=dim) scores1 = model.data["scores1"] scores2 = model.data["scores2"] @@ -837,7 +829,9 @@ def test_cmca_inverse_transform(dim, use_coslat, mock_data_array): """Inverse transform produces an approximate reconstruction of the original data""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = HilbertMCA(n_modes=19, standardize=True, use_coslat=use_coslat) + model = HilbertMCA( + n_modes=19, standardize=True, use_coslat=use_coslat, n_pca_modes="all" + ) model.fit(data1, data2, dim=dim) scores1 = model.data["scores1"] scores2 = model.data["scores2"] @@ -857,31 +851,23 @@ def test_cmca_inverse_transform(dim, use_coslat, mock_data_array): # Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", - [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), - ], -) -def test_rmca_inverse_transform( - dim, use_coslat, power, squared_loadings, mock_data_array -): + "dim, use_coslat, power", + [ + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), + ], +) +def test_rmca_inverse_transform(dim, use_coslat, power, mock_data_array): """Inverse transform produces an approximate reconstruction of the original data""" data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = MCA(n_modes=10, standardize=True, use_coslat=use_coslat) + model = MCA(n_modes=15, standardize=True, use_coslat=use_coslat, n_pca_modes="all") model.fit(data1, data2, dim=dim) - rot = MCARotator(n_modes=10, power=power, squared_loadings=squared_loadings) + rot = MCARotator(n_modes=15, power=power) rot.fit(model) scores1 = rot.data["scores1"] scores2 = rot.data["scores2"] @@ -890,36 +876,28 @@ def test_rmca_inverse_transform( r2_2 = r2_score(data2, data2_rec, dim=dim) r2_1 = r2_1.mean() r2_2 = r2_2.mean() - # Choose a threshold of 0.90; a bit arbitrary + # Choose a threshold of 0.95; a bit arbitrary assert ( - r2_1 > 0.75 + r2_1 > 0.95 ), f"Inverse transform does not produce a good reconstruction of left field (R2={r2_1.values:.2f})" assert ( - r2_2 > 0.75 + r2_2 > 0.95 ), f"Inverse transform does not produce a good reconstruction of right field (R2={r2_2.values:.2f})" # Hilbert Rotated MCA @pytest.mark.parametrize( - "dim, use_coslat, power, squared_loadings", - [ - (("time",), True, 1, False), - (("lat", "lon"), False, 1, False), - (("lon", "lat"), False, 1, False), - (("time",), True, 2, False), - (("lat", "lon"), False, 2, False), - (("lon", "lat"), False, 2, False), - (("time",), True, 1, True), - (("lat", "lon"), False, 1, True), - (("lon", "lat"), False, 1, True), - (("time",), True, 2, True), - (("lat", "lon"), False, 2, True), - (("lon", "lat"), False, 2, True), - ], -) -def test_crmca_inverse_transform( - dim, use_coslat, power, squared_loadings, mock_data_array -): + "dim, use_coslat, power", + [ + (("time",), True, 1), + (("lat", "lon"), False, 1), + (("lon", "lat"), False, 1), + (("time",), True, 2), + (("lat", "lon"), False, 2), + (("lon", "lat"), False, 2), + ], +) +def 
test_crmca_inverse_transform(dim, use_coslat, power, mock_data_array): """Inverse transform produces an approximate reconstruction of the original data""" # NOTE: The lobpcg SVD solver for Hilbert matrices requires a small number of modes # compared to the actual data size. Since we have a small test set here we only use @@ -927,9 +905,11 @@ def test_crmca_inverse_transform( # the other tests. data1 = mock_data_array.copy() data2 = data1.copy() ** 2 - model = HilbertMCA(n_modes=10, standardize=True, use_coslat=use_coslat) + model = HilbertMCA( + n_modes=10, standardize=True, use_coslat=use_coslat, use_pca=False + ) model.fit(data1, data2, dim=dim) - rot = HilbertMCARotator(n_modes=10, power=power, squared_loadings=squared_loadings) + rot = HilbertMCARotator(n_modes=10, power=power) rot.fit(model) scores1 = rot.data["scores1"] scores2 = rot.data["scores2"] @@ -938,10 +918,10 @@ def test_crmca_inverse_transform( r2_2 = r2_score(data2, data2_rec, dim=dim) r2_1 = r2_1.mean() r2_2 = r2_2.mean() - # Choose a threshold of 0.80; a bit arbitrary + # Choose a threshold of 0.95; a bit arbitrary assert ( - r2_1 > 0.80 + r2_1 > 0.95 ), f"Inverse transform does not produce a good reconstruction of left field (R2={r2_1.values:.2f})" assert ( - r2_2 > 0.80 + r2_2 > 0.95 ), f"Inverse transform does not produce a good reconstruction of right field (R2={r2_2.values:.2f})" diff --git a/tests/preprocessing/test_pca_preprocessor_dataarray.py b/tests/preprocessing/test_pca_preprocessor_dataarray.py deleted file mode 100644 index 9db17c7..0000000 --- a/tests/preprocessing/test_pca_preprocessor_dataarray.py +++ /dev/null @@ -1,181 +0,0 @@ -import pytest -import numpy as np - -from xeofs.preprocessing.preprocessor import Preprocessor -from ..conftest import generate_synthetic_dataarray -from ..utilities import ( - get_dims_from_data, - data_is_dask, - data_has_multiindex, - assert_expected_dims, - assert_expected_coords, -) - -# ============================================================================= -# GENERALLY VALID TEST CASES -# ============================================================================= -N_SAMPLE_DIMS = [1, 2] -N_FEATURE_DIMS = [1, 2] -INDEX_POLICY = ["index", "multiindex"] -NAN_POLICY = ["no_nan", "fulldim"] -DASK_POLICY = ["no_dask", "dask"] -SEED = [0] - -TEST_DATA_PARAMS = [ - (ns, nf, index, nan, dask) - for ns in N_SAMPLE_DIMS - for nf in N_FEATURE_DIMS - for index in INDEX_POLICY - for nan in NAN_POLICY - for dask in DASK_POLICY -] - -SAMPLE_DIM_NAMES = ["sample", "sample_alternative"] -FEATURE_DIM_NAMES = ["feature", "feature_alternative"] - -VALID_TEST_CASES = [ - (sample_name, feature_name, data_params) - for sample_name in SAMPLE_DIM_NAMES - for feature_name in FEATURE_DIM_NAMES - for data_params in TEST_DATA_PARAMS -] - - -# TESTS -# ============================================================================= -@pytest.mark.parametrize( - "with_std, with_coslat, with_weights", - [ - (True, True, True), - (True, True, False), - (True, False, True), - (True, False, False), - (False, True, True), - (False, True, False), - (False, False, True), - (False, False, False), - ], -) -def test_fit_transform_scalings(with_std, with_coslat, with_weights, mock_data_array): - """fit method should not be implemented.""" - prep = Preprocessor(with_std=with_std, with_coslat=with_coslat) - - weights = None - if with_weights: - weights = mock_data_array.mean("time").copy() - weights[:] = 0.5 - - data_trans = prep.fit_transform( - mock_data_array, - weights=weights, - sample_dims=("time",), - ) - 
- assert hasattr(prep, "scaler") - assert hasattr(prep, "preconverter") - assert hasattr(prep, "stacker") - assert hasattr(prep, "postconverter") - assert hasattr(prep, "sanitizer") - - # Transformed data is centered - assert np.isclose(data_trans.mean("sample"), 0).all() - # Transformed data is standardized - if with_std and not with_coslat: - if with_weights: - assert np.isclose(data_trans.std("sample"), 0.5).all() - else: - assert np.isclose(data_trans.std("sample"), 1).all() - - -@pytest.mark.parametrize( - "index_policy, nan_policy, dask_policy", - [ - ("index", "no_nan", "no_dask"), - ("multiindex", "no_nan", "dask"), - ("index", "fulldim", "no_dask"), - ("multiindex", "fulldim", "dask"), - ], -) -def test_fit_transform_same_dim_names(index_policy, nan_policy, dask_policy): - data = generate_synthetic_dataarray(1, 1, index_policy, nan_policy, dask_policy) - all_dims, sample_dims, feature_dims = get_dims_from_data(data) - - prep = Preprocessor(sample_name="sample0", feature_name="feature0") - transformed = prep.fit_transform(data, sample_dims) - reconstructed = prep.inverse_transform_data(transformed) - - data_is_dask_before = data_is_dask(data) - data_is_dask_interm = data_is_dask(transformed) - data_is_dask_after = data_is_dask(reconstructed) - - assert set(transformed.dims) == set(("sample0", "feature0")) - assert set(reconstructed.dims) == set(("sample0", "feature0")) - assert not data_has_multiindex(transformed) - assert transformed.notnull().all() - assert data_is_dask_before == data_is_dask_interm - assert data_is_dask_before == data_is_dask_after - - -@pytest.mark.parametrize( - "sample_name, feature_name, data_params", - VALID_TEST_CASES, -) -def test_fit_transform(sample_name, feature_name, data_params): - data = generate_synthetic_dataarray(*data_params) - all_dims, sample_dims, feature_dims = get_dims_from_data(data) - - prep = Preprocessor(sample_name=sample_name, feature_name=feature_name) - transformed = prep.fit_transform(data, sample_dims) - - data_is_dask_before = data_is_dask(data) - data_is_dask_after = data_is_dask(transformed) - - assert transformed.dims == (sample_name, feature_name) - assert not data_has_multiindex(transformed) - assert transformed.notnull().all() - assert data_is_dask_before == data_is_dask_after - - -@pytest.mark.parametrize( - "sample_name, feature_name, data_params", - VALID_TEST_CASES, -) -def test_inverse_transform(sample_name, feature_name, data_params): - data = generate_synthetic_dataarray(*data_params) - all_dims, sample_dims, feature_dims = get_dims_from_data(data) - - prep = Preprocessor(sample_name=sample_name, feature_name=feature_name) - transformed = prep.fit_transform(data, sample_dims) - components = transformed.rename({sample_name: "mode"}) - scores = transformed.rename({feature_name: "mode"}) - - reconstructed = prep.inverse_transform_data(transformed) - components = prep.inverse_transform_components(components) - scores = prep.inverse_transform_scores(scores) - - # Reconstructed data has the same dimensions as the original data - assert_expected_dims(data, reconstructed, policy="all") - assert_expected_dims(data, components, policy="feature") - assert_expected_dims(data, scores, policy="sample") - - # Reconstructed data has the same coordinates as the original data - assert_expected_coords(data, reconstructed, policy="all") - assert_expected_coords(data, components, policy="feature") - assert_expected_coords(data, scores, policy="sample") - - # Reconstructed data and original data have NaNs in the same FEATURES - # Note: 
NaNs in the same place is not guaranteed, since isolated NaNs will be propagated - # to all samples in the same feature - features_with_nans_before = data.isnull().any(sample_dims) - features_with_nans_after = reconstructed.isnull().any(sample_dims) - assert features_with_nans_before.equals(features_with_nans_after) - - # Reconstructed data has MultiIndex if and only if original data has MultiIndex - data_has_multiindex_before = data_has_multiindex(data) - data_has_multiindex_after = data_has_multiindex(reconstructed) - assert data_has_multiindex_before == data_has_multiindex_after - - # Reconstructed data is dask if and only if original data is dask - data_is_dask_before = data_is_dask(data) - data_is_dask_after = data_is_dask(reconstructed) - assert data_is_dask_before == data_is_dask_after diff --git a/xeofs/models/__init__.py b/xeofs/models/__init__.py index 1ba6a61..c05aef1 100644 --- a/xeofs/models/__init__.py +++ b/xeofs/models/__init__.py @@ -1,6 +1,8 @@ import warnings from .cca import CCA +from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA +from .cpcca_rotator import ComplexCPCCARotator, CPCCARotator, HilbertCPCCARotator from .eeof import ExtendedEOF from .eof import EOF, ComplexEOF, HilbertEOF from .eof_rotator import ComplexEOFRotator, EOFRotator, HilbertEOFRotator @@ -13,23 +15,29 @@ __all__ = [ "EOF", + "ExtendedEOF", + "SparsePCA", + "OPA", + "GWPCA", "ComplexEOF", + "ComplexMCA", + "ComplexCPCCA", "HilbertEOF", - "ExtendedEOF", + "HilbertMCA", + "HilbertCPCCA", "EOFRotator", "ComplexEOFRotator", "HilbertEOFRotator", - "OPA", - "GWPCA", "MCA", - "ComplexMCA", - "HilbertMCA", + "CCA", + "CPCCA", "MCARotator", + "CPCCARotator", "ComplexMCARotator", + "ComplexCPCCARotator", "HilbertMCARotator", - "CCA", + "HilbertCPCCARotator", "RotatorFactory", - "SparsePCA", ] diff --git a/xeofs/models/_base_cross_model.py b/xeofs/models/_base_cross_model.py deleted file mode 100644 index 79eb7ff..0000000 --- a/xeofs/models/_base_cross_model.py +++ /dev/null @@ -1,497 +0,0 @@ -import warnings -from abc import ABC, abstractmethod -from datetime import datetime -from typing import Dict, Hashable, Literal, Optional, Sequence, Tuple - -import dask -import xarray as xr -from dask.diagnostics.progress import ProgressBar -from typing_extensions import Self - -try: - from xarray.core.datatree import DataTree -except ImportError: - from datatree import DataTree - -from .._version import __version__ -from ..data_container import DataContainer -from ..preprocessing.preprocessor import Preprocessor -from ..utils.data_types import DataArray, DataObject -from ..utils.io import insert_placeholders, open_model_tree, write_model_tree -from ..utils.sanity_checks import validate_input_type -from ..utils.xarray_utils import convert_to_dim_type, data_is_dask -from .eof import EOF - - -class _BaseCrossModel(ABC): - """ - Abstract base class for cross-decomposition models. - - Parameters: - ------------- - n_modes: int, default=10 - Number of modes to calculate. - center: bool, default=True - Whether to center the input data. - standardize: bool, default=False - Whether to standardize the input data. - use_coslat: bool, default=False - Whether to use cosine of latitude for scaling. - check_nans : bool, default=True - If True, remove full-dimensional NaN features from the data, check to ensure - that NaN features match the original fit data during transform, and check - for isolated NaNs. Note: this forces eager computation of dask arrays. - If False, skip all NaN checks. 
In this case, NaNs should be explicitly removed - or filled prior to fitting, or SVD will fail. - n_pca_modes: int, default=None - Number of PCA modes to calculate. - compute: bool, default=True - Whether to compute elements of the model eagerly, or to defer computation. - If True, four pieces of the fit will be computed sequentially: 1) the - preprocessor scaler, 2) optional NaN checks, 3) SVD decomposition, 4) scores - and components. - sample_name: str, default="sample" - Name of the new sample dimension. - feature_name: str, default="feature" - Name of the new feature dimension. - solver: {"auto", "full", "randomized"}, default="auto" - Solver to use for the SVD computation. - solver_kwargs: dict, default={} - Additional keyword arguments to passed to the SVD solver function. - - """ - - def __init__( - self, - n_modes=10, - center=True, - standardize=False, - use_coslat=False, - check_nans=True, - n_pca_modes=None, - compute=True, - verbose=False, - sample_name="sample", - feature_name="feature", - solver="auto", - random_state=None, - solver_kwargs={}, - ): - if verbose: - warnings.warn( - "The 'verbose' parameter is deprecated and will be removed in a future release.", - category=DeprecationWarning, - stacklevel=3, - ) - self.n_modes = n_modes - self.sample_name = sample_name - self.feature_name = feature_name - - # Define model parameters - self._params = { - "n_modes": n_modes, - "center": center, - "standardize": standardize, - "use_coslat": use_coslat, - "check_nans": check_nans, - "n_pca_modes": n_pca_modes, - "compute": compute, - "sample_name": sample_name, - "feature_name": feature_name, - "solver": solver, - "random_state": random_state, - "solver_kwargs": solver_kwargs, - } - - self._decomposer_kwargs = { - "n_modes": n_modes, - "solver": solver, - "random_state": random_state, - "compute": compute, - "verbose": verbose, - "solver_kwargs": solver_kwargs, - } - self._preprocessor_kwargs = { - "sample_name": sample_name, - "feature_name": feature_name, - "with_center": center, - "with_std": standardize, - "with_coslat": use_coslat, - "check_nans": check_nans, - "compute": compute, - } - - # Define analysis-relevant meta data - self.attrs = {"model": "BaseCrossModel"} - self.attrs.update( - { - "software": "xeofs", - "version": __version__, - "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - ) - self.attrs.update(self._params) - - # Initialize preprocessors to scale and stack left (1) and right (2) data - self.preprocessor1 = Preprocessor(**self._preprocessor_kwargs) - self.preprocessor2 = Preprocessor(**self._preprocessor_kwargs) - - # Initialize the data container that stores the results - self.data = DataContainer() - - # Initialize PCA objects - self.pca1 = ( - EOF(n_modes=n_pca_modes, compute=self._params["compute"], check_nans=False) - if n_pca_modes - else None - ) - self.pca2 = ( - EOF(n_modes=n_pca_modes, compute=self._params["compute"], check_nans=False) - if n_pca_modes - else None - ) - - def get_serialization_attrs(self) -> Dict: - return dict( - data=self.data, - preprocessor1=self.preprocessor1, - preprocessor2=self.preprocessor2, - ) - - def fit( - self, - data1: DataObject, - data2: DataObject, - dim: Hashable | Sequence[Hashable], - weights1: Optional[DataObject] = None, - weights2: Optional[DataObject] = None, - ) -> Self: - """ - Fit the model to the data. - - Parameters - ---------- - data1: DataArray | Dataset | List[DataArray] - Left input data. - data2: DataArray | Dataset | List[DataArray] - Right input data. 
- dim: Hashable | Sequence[Hashable] - Define the sample dimensions. The remaining dimensions - will be treated as feature dimensions. - weights1: Optional[DataObject] - Weights to be applied to the left input data. - weights2: Optional[DataObject] - Weights to be applied to the right input data. - - """ - validate_input_type(data1) - validate_input_type(data2) - if weights1 is not None: - validate_input_type(weights1) - if weights2 is not None: - validate_input_type(weights2) - - self.sample_dims = convert_to_dim_type(dim) - # Preprocess data1 - data1 = self.preprocessor1.fit_transform(data1, self.sample_dims, weights1) - # Preprocess data2 - data2 = self.preprocessor2.fit_transform(data2, self.sample_dims, weights2) - - self._fit_algorithm(data1, data2) - - if self._params["compute"]: - self.data.compute() - - return self - - def transform( - self, data1: Optional[DataObject] = None, data2: Optional[DataObject] = None - ) -> Sequence[DataArray]: - """ - Abstract method to transform the data. - - - """ - if data1 is None and data2 is None: - raise ValueError("Either data1 or data2 must be provided.") - - if data1 is not None: - validate_input_type(data1) - # Preprocess data1 - data1 = self.preprocessor1.transform(data1) - if data2 is not None: - validate_input_type(data2) - # Preprocess data2 - data2 = self.preprocessor2.transform(data2) - - data = self._transform_algorithm(data1, data2) - data_list = [] - if data1 is not None: - data1 = self.preprocessor1.inverse_transform_scores_unseen(data["data1"]) - data_list.append(data1) - if data2 is not None: - data2 = self.preprocessor2.inverse_transform_scores_unseen(data["data2"]) - data_list.append(data2) - return data_list - - def inverse_transform( - self, scores1: DataArray, scores2: DataArray - ) -> Tuple[DataObject, DataObject]: - """Reconstruct the original data from transformed data. - - Parameters - ---------- - scores1: DataObject - Transformed left field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - scores2: DataObject - Transformed right field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - - Returns - ------- - Xrec1: DataArray | Dataset | List[DataArray] - Reconstructed data of left field. - Xrec2: DataArray | Dataset | List[DataArray] - Reconstructed data of right field. - - """ - # Handle scalar mode in xr.dot - if "mode" not in scores1.dims: - scores1 = scores1.expand_dims("mode") - if "mode" not in scores2.dims: - scores2 = scores2.expand_dims("mode") - - data1, data2 = self._inverse_transform_algorithm(scores1, scores2) - - # Unstack and rescale the data - data1 = self.preprocessor1.inverse_transform_data(data1) - data2 = self.preprocessor2.inverse_transform_data(data2) - - return data1, data2 - - @abstractmethod - def _fit_algorithm(self, data1: DataArray, data2: DataArray) -> Self: - """ - Fit the model to the preprocessed data. This method needs to be implemented in the respective - subclass. - - Parameters - ---------- - data1, data2: DataArray - Preprocessed input data of two dimensions: (`sample_name`, `feature_name`) - - """ - raise NotImplementedError - - @abstractmethod - def _transform_algorithm( - self, data1: Optional[DataArray] = None, data2: Optional[DataArray] = None - ) -> Dict[str, DataArray]: - """ - Transform the preprocessed data. This method needs to be implemented in the respective - subclass. 
- - Parameters - ---------- - data1, data2: DataArray - Preprocessed input data of two dimensions: (`sample_name`, `feature_name`) - - """ - raise NotImplementedError - - @abstractmethod - def _inverse_transform_algorithm( - self, scores1: DataArray, scores2: DataArray - ) -> Tuple[DataArray, DataArray]: - """ - Reconstruct the original data from transformed data. This method needs to be implemented in the respective - subclass. - - Parameters - ---------- - scores1: DataArray - Transformed left field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - scores2: DataArray - Transformed right field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - - Returns - ------- - Xrec1: DataArray - Reconstructed data of left field. - Xrec2: DataArray - Reconstructed data of right field. - - """ - raise NotImplementedError - - def components(self) -> Tuple[DataObject, DataObject]: - """Get the components.""" - comps1 = self.data["components1"] - comps2 = self.data["components2"] - - components1: DataObject = self.preprocessor1.inverse_transform_components( - comps1 - ) - components2: DataObject = self.preprocessor2.inverse_transform_components( - comps2 - ) - return components1, components2 - - def scores(self) -> Tuple[DataArray, DataArray]: - """Get the scores.""" - scores1 = self.data["scores1"] - scores2 = self.data["scores2"] - - scores1: DataArray = self.preprocessor1.inverse_transform_scores(scores1) - scores2: DataArray = self.preprocessor2.inverse_transform_scores(scores2) - return scores1, scores2 - - def compute(self, verbose: bool = False, **kwargs): - """Compute and load delayed model results. - - Parameters - ---------- - verbose : bool - Whether or not to provide additional information about the computing progress. - **kwargs - Additional keyword arguments to pass to `dask.compute()`. - """ - # find and compute all dask arrays simultaneously to allow dask to optimize the - # shared graph and avoid duplicate i/o and computations - dt = self.serialize() - - data_objs = { - k: v - for k, v in dt.to_dict().items() - if data_is_dask(v) and v.attrs.get("allow_compute", True) - } - - if verbose: - with ProgressBar(): - (data_objs,) = dask.compute(data_objs, **kwargs) - else: - (data_objs,) = dask.compute(data_objs, **kwargs) - - for k, v in data_objs.items(): - dt[k] = DataTree(v) - - # then rebuild the trained model from the computed results - self._deserialize_attrs(dt) - - self._post_compute() - - def _post_compute(self): - pass - - def get_params(self) -> Dict: - """Get the model parameters.""" - return self._params - - def serialize(self) -> DataTree: - """Serialize a complete model with its preprocessors.""" - # Create a root node for this object with its params as attrs - ds_root = xr.Dataset(attrs=dict(params=self.get_params())) - dt = DataTree(data=ds_root, name=type(self).__name__) - - # Retrieve the tree representation of each attached object, or set basic attrs - for key, attr in self.get_serialization_attrs().items(): - if hasattr(attr, "serialize"): - dt[key] = attr.serialize() - dt.attrs[key] = "_is_tree" - else: - dt.attrs[key] = attr - - return dt - - def save( - self, - path: str, - overwrite: bool = False, - save_data: bool = False, - engine: Literal["zarr", "netcdf4", "h5netcdf"] = "zarr", - **kwargs, - ): - """Save the model. - - Parameters - ---------- - path : str - Path to save the model. 
- overwrite: bool, default=False - Whether or not to overwrite the existing path if it already exists. - Ignored unless `engine="zarr"`. - save_data : str - Whether or not to save the full input data along with the fitted components. - engine : {"zarr", "netcdf4", "h5netcdf"}, default="zarr" - Xarray backend engine to use for writing the saved model. - **kwargs - Additional keyword arguments to pass to `DataTree.to_netcdf()` or `DataTree.to_zarr()`. - - """ - self.compute() - - dt = self.serialize() - - # Remove any raw data arrays at this stage - if not save_data: - dt = insert_placeholders(dt) - - write_model_tree(dt, path, overwrite=overwrite, engine=engine, **kwargs) - - @classmethod - def deserialize(cls, dt: DataTree) -> Self: - """Deserialize the model and its preprocessors from a DataTree.""" - # Recreate the model with parameters set by root level attrs - model = cls(**dt.attrs["params"]) - model._deserialize_attrs(dt) - return model - - def _deserialize_attrs(self, dt: DataTree): - """Set the necessary attributes of the model from a DataTree.""" - for key, attr in dt.attrs.items(): - if key == "params": - continue - elif attr == "_is_tree": - deserialized_obj = getattr(self, key).deserialize(dt[key]) - else: - deserialized_obj = attr - setattr(self, key, deserialized_obj) - - @classmethod - def load( - cls, - path: str, - engine: Literal["zarr", "netcdf4", "h5netcdf"] = "zarr", - **kwargs, - ) -> Self: - """Load a saved model. - - Parameters - ---------- - path : str - Path to the saved model. - engine : {"zarr", "netcdf4", "h5netcdf"}, default="zarr" - Xarray backend engine to use for reading the saved model. - **kwargs - Additional keyword arguments to pass to `open_datatree()`. - - Returns - ------- - model : _BaseCrossModel - The loaded model. - - """ - dt = open_model_tree(path, engine=engine, **kwargs) - model = cls.deserialize(dt) - return model - - def _validate_loaded_data(self, data: DataArray): - """Optionally check the loaded data for placeholders.""" - pass diff --git a/xeofs/models/_base_model.py b/xeofs/models/_base_model.py index ff4e78b..c7d3fdb 100644 --- a/xeofs/models/_base_model.py +++ b/xeofs/models/_base_model.py @@ -1,37 +1,25 @@ import warnings from abc import ABC, abstractmethod from datetime import datetime -from typing import ( - Any, - Dict, - Hashable, - List, - Literal, - Optional, - Sequence, -) +from typing import Any, Literal -import dask +import dask.base import xarray as xr from dask.diagnostics.progress import ProgressBar from typing_extensions import Self -try: - from xarray.core.datatree import DataTree -except ImportError: - from datatree import DataTree - from .._version import __version__ -from ..data_container import DataContainer -from ..preprocessing.preprocessor import Preprocessor -from ..utils.data_types import Data, DataArray, DataObject +from ..utils.data_types import DataArray from ..utils.io import insert_placeholders, open_model_tree, write_model_tree -from ..utils.sanity_checks import validate_input_type from ..utils.xarray_utils import ( - convert_to_dim_type, data_is_dask, ) +try: + from xarray.core.datatree import DataTree # type: ignore +except ImportError: + from datatree import DataTree + # Ignore warnings from numpy casting with additional coordinates warnings.filterwarnings("ignore", message=r"^invalid value encountered in cast*") @@ -40,92 +28,15 @@ class _BaseModel(ABC): """ - Abstract base class for EOF model. - - Parameters - ---------- - n_modes: int, default=10 - Number of modes to calculate. 
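# Minimal sketch of the serialize/save/load round trip provided by the base
# class, shown with the concrete EOF model. File name, engine and the synthetic
# data are illustrative assumptions, not part of the patch.
import numpy as np
import xarray as xr
from xeofs.models import EOF

da = xr.DataArray(
    np.random.default_rng(0).standard_normal((50, 12)),
    dims=("time", "lon"),
    coords={"time": np.arange(50), "lon": np.arange(12)},
)

model = EOF(n_modes=2)
model.fit(da, dim="time")
model.save("eof_model.zarr", overwrite=True, engine="zarr")  # writes a DataTree
restored = EOF.load("eof_model.zarr", engine="zarr")         # rebuilds the model
components = restored.components()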
- center: bool, default=True - Whether to center the input data. - standardize: bool, default=False - Whether to standardize the input data. - use_coslat: bool, default=False - Whether to use cosine of latitude for scaling. - check_nans : bool, default=True - If True, remove full-dimensional NaN features from the data, check to ensure - that NaN features match the original fit data during transform, and check - for isolated NaNs. Note: this forces eager computation of dask arrays. - If False, skip all NaN checks. In this case, NaNs should be explicitly removed - or filled prior to fitting, or SVD will fail. - sample_name: str, default="sample" - Name of the sample dimension. - feature_name: str, default="feature" - Name of the feature dimension. - compute : bool, default=True - Whether to compute elements of the model eagerly, or to defer computation. - If True, four pieces of the fit will be computed sequentially: 1) the - preprocessor scaler, 2) optional NaN checks, 3) SVD decomposition, 4) scores - and components. - verbose: bool, default=False - Whether to show a progress bar when computing the decomposition. - random_state: Optional[int], default=None - Seed for the random number generator. - solver: {"auto", "full", "randomized"}, default="auto" - Solver to use for the SVD computation. - solver_kwargs: dict, default={} - Additional keyword arguments to pass to the SVD solver function. + Abstract base class for an xeofs model. - """ + Provides basic functionality for lazy model evaluation, serialization, deserialization and saving/loading models. - def __init__( - self, - n_modes=10, - center=True, - standardize=False, - use_coslat=False, - check_nans=True, - sample_name="sample", - feature_name="feature", - compute=True, - verbose=False, - random_state=None, - solver="auto", - solver_kwargs={}, - ): - if verbose: - warnings.warn( - "The 'verbose' parameter is deprecated and will be removed in a future release.", - category=DeprecationWarning, - stacklevel=3, - ) - self.n_modes = n_modes - self.sample_name = sample_name - self.feature_name = feature_name + """ + def __init__(self): # Define model parameters - self._params = { - "n_modes": n_modes, - "center": center, - "standardize": standardize, - "use_coslat": use_coslat, - "check_nans": check_nans, - "sample_name": sample_name, - "feature_name": feature_name, - "random_state": random_state, - "verbose": verbose, - "compute": compute, - "solver": solver, - "solver_kwargs": solver_kwargs, - } - self._decomposer_kwargs = { - "n_modes": n_modes, - "solver": solver, - "random_state": random_state, - "compute": compute, - "verbose": verbose, - "solver_kwargs": solver_kwargs, - } + self._params = {} # Define analysis-relevant meta data self.attrs = {"model": "BaseModel"} @@ -138,229 +49,11 @@ def __init__( ) self.attrs.update(self._params) - # Initialize the Preprocessor to scale and stack the data - self.preprocessor = Preprocessor( - sample_name=sample_name, - feature_name=feature_name, - with_center=center, - with_std=standardize, - with_coslat=use_coslat, - check_nans=check_nans, - compute=compute, - ) - # Initialize the data container that stores the results - self.data = DataContainer() - - def get_serialization_attrs(self) -> Dict: - return dict( - data=self.data, - preprocessor=self.preprocessor, - ) - - def fit( - self, - X: List[Data] | Data, - dim: Sequence[Hashable] | Hashable, - weights: Optional[List[Data] | Data] = None, - ) -> Self: - """ - Fit the model to the input data. 
- - Parameters - ---------- - X: DataArray | Dataset | List[DataArray] - Input data. - dim: Sequence[Hashable] | Hashable - Specify the sample dimensions. The remaining dimensions - will be treated as feature dimensions. - weights: Optional[DataArray | Dataset | List[DataArray]] - Weighting factors for the input data. - - """ - # Check for invalid types - validate_input_type(X) - if weights is not None: - validate_input_type(weights) - - self.sample_dims = convert_to_dim_type(dim) - - # Preprocess the data & transform to 2D - data2D: DataArray = self.preprocessor.fit_transform( - X, self.sample_dims, weights - ) - - self._fit_algorithm(data2D) - - if self._params["compute"]: - self.data.compute() - - return self - - @abstractmethod - def _fit_algorithm(self, data: DataArray) -> Self: - """Fit the model to the input data assuming a 2D DataArray. - - Parameters - ---------- - data: DataArray - Input data with dimensions (sample_name, feature_name) - - Returns - ------- - self: Self - The fitted model. - - """ - raise NotImplementedError - - def transform(self, data: List[Data] | Data, normalized=True) -> DataArray: - """Project data onto the components. - - Parameters - ---------- - data: DataArray | Dataset | List[DataArray] - Data to be transformed. - normalized: bool, default=True - Whether to normalize the scores by the L2 norm. - - Returns - ------- - projections: DataArray - Projections of the data onto the components. - - """ - validate_input_type(data) - - data2D = self.preprocessor.transform(data) - data2D = self._transform_algorithm(data2D) - if normalized: - data2D = data2D / self.data["norms"] - data2D.name = "scores" - return self.preprocessor.inverse_transform_scores_unseen(data2D) - @abstractmethod - def _transform_algorithm(self, data: DataArray) -> DataArray: - """Project data onto the components. - - Parameters - ---------- - data: DataArray - Input data with dimensions (sample_name, feature_name) - - Returns - ------- - projections: DataArray - Projections of the data onto the components. - - """ + def get_serialization_attrs(self) -> dict: + """Get the attributes to serialize.""" raise NotImplementedError - def fit_transform( - self, - data: List[Data] | Data, - dim: Sequence[Hashable] | Hashable, - weights: Optional[List[Data] | Data] = None, - **kwargs, - ) -> DataArray: - """Fit the model to the input data and project the data onto the components. - - Parameters - ---------- - data: DataObject - Input data. - dim: Sequence[Hashable] | Hashable - Specify the sample dimensions. The remaining dimensions - will be treated as feature dimensions. - weights: Optional[DataObject] - Weighting factors for the input data. - **kwargs - Additional keyword arguments to pass to the transform method. - - Returns - ------- - projections: DataArray - Projections of the data onto the components. - - """ - return self.fit(data, dim, weights).transform(data, **kwargs) - - def inverse_transform( - self, scores: DataArray, normalized: bool = True - ) -> DataObject: - """Reconstruct the original data from transformed data. - - Parameters - ---------- - scores: DataArray - Transformed data to be reconstructed. This could be a subset - of the `scores` data of a fitted model, or unseen data. Must - have a 'mode' dimension. - normalized: bool, default=True - Whether the scores data have been normalized by the L2 norm. - - Returns - ------- - data: DataArray | Dataset | List[DataArray] - Reconstructed data. 
- - """ - if normalized: - norms = self.data["norms"].sel(mode=scores.mode) - scores = scores * norms - - # Handle scalar mode in xr.dot - if "mode" not in scores.dims: - scores = scores.expand_dims("mode") - - data_reconstructed = self._inverse_transform_algorithm(scores) - - # Reconstructing the data using a single mode introduces a - # redundant "mode" coordinate - if "mode" in data_reconstructed.coords: - data_reconstructed = data_reconstructed.drop_vars("mode") - - return self.preprocessor.inverse_transform_data(data_reconstructed) - - @abstractmethod - def _inverse_transform_algorithm(self, scores: DataObject) -> DataArray: - """Reconstruct the original data from transformed data. - - Parameters - ---------- - scores: DataObject - Transformed data to be reconstructed. This could be a subset - of the `scores` data of a fitted model, or unseen data. Must - have a 'mode' dimension. - - Returns - ------- - data: DataArray - Reconstructed 2D data with dimensions (sample_name, feature_name) - - """ - raise NotImplementedError - - def components(self) -> DataObject: - """Get the components.""" - components = self.data["components"] - return self.preprocessor.inverse_transform_components(components) - - def scores(self, normalized=True) -> DataArray: - """Get the scores. - - Parameters - ---------- - normalized: bool, default=True - Whether to normalize the scores by the L2 norm. - """ - scores = self.data["scores"].copy() - if normalized: - attrs = scores.attrs.copy() - scores = scores / self.data["norms"] - scores.attrs.update(attrs) - scores.name = "scores" - return self.preprocessor.inverse_transform_scores(scores) - def compute(self, verbose: bool = False, **kwargs): """Compute and load delayed model results. @@ -383,9 +76,9 @@ def compute(self, verbose: bool = False, **kwargs): if verbose: with ProgressBar(): - (data_objs,) = dask.compute(data_objs, **kwargs) + (data_objs,) = dask.base.compute(data_objs, **kwargs) else: - (data_objs,) = dask.compute(data_objs, **kwargs) + (data_objs,) = dask.base.compute(data_objs, **kwargs) for k, v in data_objs.items(): dt[k] = DataTree(v) @@ -398,7 +91,7 @@ def compute(self, verbose: bool = False, **kwargs): def _post_compute(self): pass - def get_params(self) -> Dict[str, Any]: + def get_params(self) -> dict[str, Any]: """Get the model parameters.""" return self._params @@ -467,10 +160,10 @@ def _deserialize_attrs(self, dt: DataTree): if key == "params": continue elif attr == "_is_tree": - deserialized_obj = getattr(self, key).deserialize(dt[key]) + deserialized_obj = getattr(self, str(key)).deserialize(dt[str(key)]) else: deserialized_obj = attr - setattr(self, key, deserialized_obj) + setattr(self, str(key), deserialized_obj) @classmethod def load( diff --git a/xeofs/models/_base_model_cross_set.py b/xeofs/models/_base_model_cross_set.py new file mode 100644 index 0000000..0273eab --- /dev/null +++ b/xeofs/models/_base_model_cross_set.py @@ -0,0 +1,552 @@ +import warnings +from abc import abstractmethod +from typing import Any, Hashable, Sequence + +from numpy.random import Generator +from typing_extensions import Self + +from ..data_container import DataContainer +from ..preprocessing.preprocessor import Preprocessor +from ..preprocessing.whitener import Whitener +from ..utils.data_types import DataArray, DataObject, GenericType +from ..utils.sanity_checks import validate_input_type +from ..utils.xarray_utils import convert_to_dim_type +from ._base_model import _BaseModel + + +class _BaseModelCrossSet(_BaseModel): + """ + Abstract base class for 
cross-decomposition models. + + Parameters: + ------------- + n_modes: int + Number of modes to calculate. + center: bool, default=True + Whether to center the input data. + standardize: bool, default=False + Whether to standardize the input data. + use_coslat: bool, default=False + Whether to use cosine of latitude for scaling. + check_nans : bool, default=True + If True, remove full-dimensional NaN features from the data, check to + ensure that NaN features match the original fit data during transform, + and check for isolated NaNs. Note: this forces eager computation of dask + arrays. If False, skip all NaN checks. In this case, NaNs should be + explicitly removed or filled prior to fitting, or SVD will fail. + alpha : float, default=1.0 + Parameter to perform fractional whitening of the data. If 0, the data is + completely whitened. If 1, the data is not whitened. + use_pca : bool, default=False + If True, perform PCA to reduce the dimensionality of the data. + n_pca_modes : int | float | str, default=0.999 + If int, specifies the number of modes to retain. If float, specifies the + fraction of variance in the (whitened) data that should be explained by + the retained modes. If "all", all modes are retained. + init_rank_reduction : float, default=0.3 + Only relevant when `use_pca=True` and `n_modes` is a float, in which + case it denotes the fraction of the initial rank to reduce the data to + via PCA as a first guess before truncating the solution to the desired + fraction of explained variance. This allows for faster computation of + PCA via randomized SVD and avoids the need to compute the full SVD. + compute: bool, default=True + Whether to compute elements of the model eagerly, or to defer + computation. If True, four pieces of the fit will be computed + sequentially: 1) the preprocessor scaler, 2) optional NaN checks, 3) SVD + decomposition, 4) scores and components. + random_state: numpy.random.Generator or int, optional + Seed for the random number generator. + sample_name: str, default="sample" + Name of the new sample dimension. + feature_name: str, default="feature" + Name of the new feature dimension. + solver: {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs: dict[str, Any], default={} + Additional keyword arguments to passed to the SVD solver function. 
+ + """ + + def __init__( + self, + n_modes: int, + center: Sequence[bool] | bool = True, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + alpha: Sequence[float] | float = 1.0, + solver: Sequence[str] | str = "auto", + compute: bool = True, + verbose: bool = False, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + random_state: Generator | int | None = None, + solver_kwargs: dict[str, Any] = {}, + ): + if verbose: + warnings.warn( + "The 'verbose' parameter is deprecated and will be removed in a future release.", + category=DeprecationWarning, + stacklevel=3, + ) + + super().__init__() + + # Process parameters + center = self._process_parameter("center", center, True) + standardize = self._process_parameter("standardize", standardize, False) + use_coslat = self._process_parameter("use_coslat", use_coslat, False) + check_nans = self._process_parameter("check_nans", check_nans, True) + use_pca = self._process_parameter("use_pca", use_pca, True) + n_pca_modes = self._process_parameter("n_pca_modes", n_pca_modes, 0.999) + pca_init_rank_reduction = self._process_parameter( + "pca_init_rank_reduction", pca_init_rank_reduction, 0.3 + ) + + alpha = self._process_parameter("alpha", alpha, 1.0) + # Ensure that alpha is a float + alpha = [float(a) for a in alpha] + + # Use feature1 and feature2 throughout the model to refer to the two datasets + if isinstance(feature_name, str): + feature_name = [feature_name + str(i + 1) for i in range(2)] + self._check_parameter_number("feature_name", feature_name) + if feature_name[0] == feature_name[1]: + raise ValueError("feature_name must be different for each dataset") + + # Define model parameters + self.sample_name = sample_name + self.feature_name = feature_name + + self._params = { + "n_modes": n_modes, + "center": center, + "standardize": standardize, + "use_coslat": use_coslat, + "check_nans": check_nans, + "use_pca": use_pca, + "n_pca_modes": n_pca_modes, + "pca_init_rank_reduction": pca_init_rank_reduction, + "alpha": alpha, + "sample_name": sample_name, + "feature_name": feature_name, + "random_state": random_state, + "verbose": verbose, + "compute": compute, + "solver": solver, + } + + self._decomposer_kwargs = { + "n_modes": n_modes, + "init_rank_reduction": pca_init_rank_reduction, + "solver": solver, + "random_state": random_state, + "compute": compute, + "verbose": False, + "solver_kwargs": solver_kwargs, + } + + # Define analysis-relevant meta data + self.attrs.update({"model": "BaseModelCrossSet"}) + self.attrs.update(self.get_params()) + + # Initialize preprocessors for dataset X and Y + self.preprocessor1 = Preprocessor( + sample_name=sample_name, + feature_name=feature_name[0], + with_center=center[0], + with_std=standardize[0], + with_coslat=use_coslat[0], + check_nans=check_nans[0], + compute=compute, + ) + + self.preprocessor2 = Preprocessor( + sample_name=sample_name, + feature_name=feature_name[1], + with_center=center[1], + with_std=standardize[1], + with_coslat=use_coslat[1], + check_nans=check_nans[1], + compute=compute, + ) + + self.whitener1 = Whitener( + alpha=alpha[0], + use_pca=use_pca[0], + n_modes=n_pca_modes[0], + init_rank_reduction=pca_init_rank_reduction[0], + sample_name=sample_name, + feature_name=feature_name[0], + ) + self.whitener2 
= Whitener( + alpha=alpha[1], + use_pca=use_pca[1], + n_modes=n_pca_modes[1], + init_rank_reduction=pca_init_rank_reduction[1], + sample_name=sample_name, + feature_name=feature_name[1], + ) + + # Initialize the data container that stores the results + self.data = DataContainer() + + @abstractmethod + def _fit_algorithm(self, X: DataArray, Y: DataArray) -> Self: + """ + Fit the model to the preprocessed data. This method needs to be implemented in the respective + subclass. + + Parameters + ---------- + X, Y: DataArray + Preprocessed input data of two dimensions: (`sample_name`, `feature_name`) + + """ + raise NotImplementedError + + @abstractmethod + def _transform_algorithm( + self, X: DataArray | None = None, Y: DataArray | None = None, **kwargs + ) -> dict[str, DataArray]: + """ + Transform the preprocessed data. This method needs to be implemented in the respective + subclass. + + Parameters + ---------- + X, Y: DataArray + Preprocessed input data of two dimensions: (`sample_name`, `feature_name`) + + """ + raise NotImplementedError + + @abstractmethod + def _inverse_transform_algorithm( + self, X: DataArray | None = None, Y: DataArray | None = None, **kwargs + ) -> dict[str, DataArray]: + """ + Reconstruct the original data from transformed data. This method needs to be implemented in the respective + subclass. + + Parameters + ---------- + scores1: DataArray + Transformed left field data to be reconstructed. This could be + a subset of the `scores` data of a fitted model, or unseen data. + Must have a 'mode' dimension. + scores2: DataArray + Transformed right field data to be reconstructed. This could be + a subset of the `scores` data of a fitted model, or unseen data. + Must have a 'mode' dimension. + + Returns + ------- + Xrec1: DataArray + Reconstructed data of left field. + Xrec2: DataArray + Reconstructed data of right field. + + """ + raise NotImplementedError + + @abstractmethod + def _predict_algorithm(self, X: DataArray, **kwargs) -> DataArray: + """Predict the right field from the left field. This method needs to be implemented in the respective subclass.""" + raise NotImplementedError + + @abstractmethod + def _get_components(self, **kwargs) -> tuple[DataArray, DataArray]: + """Get the components.""" + raise NotImplementedError + + @abstractmethod + def _get_scores(self, **kwargs) -> tuple[DataArray, DataArray]: + """Get the scores.""" + raise NotImplementedError + + def fit( + self, + X: DataObject, + Y: DataObject, + dim: Hashable | Sequence[Hashable], + weights_X: DataObject | None = None, + weights_Y: DataObject | None = None, + ) -> Self: + """Fit the data to the model. + + Parameters + ---------- + X, Y: DataObject + Data to be fitted. + dim: Hashable | Sequence[Hashable] + Define the sample dimensions. The remaining dimensions + will be treated as feature dimensions. + weights_X, weights_Y: DataObject | None, default=None + Weights for the data. If None, no weights are used. + + Returns + ------- + xeofs MultiSetModel + Fitted model. 
+ + """ + validate_input_type(X) + validate_input_type(Y) + if weights_X is not None: + validate_input_type(weights_X) + if weights_Y is not None: + validate_input_type(weights_Y) + + self.sample_dims = convert_to_dim_type(dim) + # Preprocess data + X = self.preprocessor1.fit_transform(X, self.sample_dims, weights_X) + Y = self.preprocessor2.fit_transform(Y, self.sample_dims, weights_Y) + # Whiten data + X = self.whitener1.fit_transform(X) + Y = self.whitener2.fit_transform(Y) + # Augment data + X, y = self._augment_data(X, Y) + # Fit the model + self._fit_algorithm(X, Y) + + if self.get_params()["compute"]: + self.data.compute() + + return self + + def transform( + self, X: DataObject | None = None, Y: DataObject | None = None, normalized=False + ) -> Sequence[DataArray] | DataArray: + """Transform the data. + + Parameters + ---------- + X, Y: DataObject | None + Data to be transformed. At least one of them must be provided. + normalized: bool, default=False + Whether to return normalized scores. + + Returns + ------- + Sequence[DataArray] | DataArray + Transformed data. + + + """ + if X is None and Y is None: + raise ValueError("Either X or Y must be provided.") + + if X is not None: + validate_input_type(X) + # Preprocess X + X = self.preprocessor1.transform(X) + X = self.whitener1.transform(X) + if Y is not None: + validate_input_type(Y) + # Preprocess Y + Y = self.preprocessor2.transform(Y) + Y = self.whitener2.transform(Y) + + data = self._transform_algorithm(X, Y, normalized=normalized) + data_list = [] + if X is not None: + X = self.whitener1.inverse_transform_scores_unseen(data["data1"]) + X = self.preprocessor1.inverse_transform_scores_unseen(X) + data_list.append(X) + if Y is not None: + Y = self.whitener2.inverse_transform_scores_unseen(data["data2"]) + Y = self.preprocessor2.inverse_transform_scores_unseen(Y) + data_list.append(Y) + + if len(data_list) == 1: + return data_list[0] + else: + return data_list + + def inverse_transform( + self, X: DataArray | None = None, Y: DataArray | None = None + ) -> Sequence[DataObject] | DataObject: + """Reconstruct the original data from transformed data. + + Parameters + ---------- + X, Y: DataArray | None + Transformed data to be reconstructed. At least one of them must be provided. + + Returns + ------- + Sequence[DataObject] | DataObject + Reconstructed data. + + """ + x_is_given = X is not None + y_is_given = Y is not None + + if (not x_is_given) and (not y_is_given): + raise ValueError("Either X or Y must be provided.") + + if x_is_given: + # Handle scalar mode in xr.dot + if "mode" not in X.dims: + X = X.expand_dims("mode") + + if y_is_given: + # Handle scalar mode in xr.dot + if "mode" not in Y.dims: + Y = Y.expand_dims("mode") + + inv_transformed = self._inverse_transform_algorithm(X, Y) + + results: list[DataObject] = [] + + # Unstack and rescale the data + if x_is_given: + X = inv_transformed["X"] + X = self.whitener1.inverse_transform_data(X) + Xrec = self.preprocessor1.inverse_transform_data(X) + results.append(Xrec) + if y_is_given: + Y = inv_transformed["Y"] + Y = self.whitener2.inverse_transform_data(Y) + Yrec = self.preprocessor2.inverse_transform_data(Y) + results.append(Yrec) + + if len(results) == 1: + return results[0] + else: + return results + + def predict(self, X: DataObject) -> DataArray: + """Predict Y from X. + + Parameters + ---------- + X: DataObject + Data to be used for prediction. + + Returns + ------- + DataArray + Predicted data in transformed space. 
+ + """ + + validate_input_type(X) + + # Preprocess X + X = self.preprocessor1.transform(X) + + # Whiten X + X = self.whitener1.transform(X) + + # Predict Y + Y = self._predict_algorithm(X) + + # Inverse transform Y + Y = self.whitener2.inverse_transform_scores_unseen(Y) + Y = self.preprocessor2.inverse_transform_scores_unseen(Y) + + return Y + + def components(self, normalized=True) -> tuple[DataObject, DataObject]: + """Get the components of the model. + + The components may be referred to differently depending on the model + type. Common terms include canonical vectors, singular vectors, loadings + or spatial patterns. + + Parameters + ---------- + normalized: bool, default=True + Whether to return L2 normalized components. + + Returns + ------- + tuple[DataObject, DataObject] + Components of X and Y. + + """ + Px, Py = self._get_components(normalized=normalized) + + Px = self.whitener1.inverse_transform_components(Px) + Py = self.whitener2.inverse_transform_components(Py) + + Px: DataObject = self.preprocessor1.inverse_transform_components(Px) + Py: DataObject = self.preprocessor2.inverse_transform_components(Py) + return Px, Py + + def scores(self, normalized=False) -> tuple[DataArray, DataArray]: + """Get the scores of the model. + + The component scores may be referred to differently depending on the + model type. Common terms include canonical variates, expansion + coefficents, principal component (scores) or temporal patterns. + + Parameters + ---------- + normalized: bool, default=False + Whether to return L2 normalized scores. + + Returns + ------- + tuple[DataArray, DataArray] + Scores of X and Y. + + """ + Rx, Ry = self._get_scores(normalized=normalized) + + Rx = self.whitener1.inverse_transform_scores(Rx) + Ry = self.whitener2.inverse_transform_scores(Ry) + + Rx: DataArray = self.preprocessor1.inverse_transform_scores(Rx) + Ry: DataArray = self.preprocessor2.inverse_transform_scores(Ry) + return Rx, Ry + + def get_serialization_attrs(self) -> dict: + """Get the attributes needed to serialize the model. + + Returns + ------- + dict + Attributes needed to serialize the model. + + """ + return dict( + data=self.data, + preprocessor1=self.preprocessor1, + preprocessor2=self.preprocessor2, + whitener1=self.whitener1, + whitener2=self.whitener2, + ) + + def _augment_data(self, X: DataArray, Y: DataArray) -> tuple[DataArray, DataArray]: + """Optional method to augment the data before fitting.""" + return X, Y + + def _process_parameter( + self, + parameter_name: str, + parameter: Sequence[GenericType] | GenericType | None, + default: GenericType, + ) -> Sequence[GenericType]: + n_datasets = 2 + if parameter is None: + parameter = [default] * n_datasets + elif not isinstance(parameter, (list, tuple)): + parameter = [parameter] * n_datasets + self._check_parameter_number(parameter_name, parameter) + return parameter + + @staticmethod + def _check_parameter_number( + parameter_name: str, parameter: Sequence[GenericType] + ) -> None: + if len(parameter) != 2: + err_msg = ( + f"Expected 2 items for '{parameter_name}', but got {len(parameter)}." 
+ ) + raise ValueError(err_msg) diff --git a/xeofs/models/_base_model_single_set.py b/xeofs/models/_base_model_single_set.py new file mode 100644 index 0000000..5f893a7 --- /dev/null +++ b/xeofs/models/_base_model_single_set.py @@ -0,0 +1,340 @@ +import warnings +from abc import abstractmethod +from typing import ( + Hashable, + Sequence, +) + +import xarray as xr +from typing_extensions import Self + +from ..data_container import DataContainer +from ..preprocessing.preprocessor import Preprocessor +from ..utils.data_types import DataArray, DataObject +from ..utils.sanity_checks import validate_input_type +from ..utils.xarray_utils import ( + convert_to_dim_type, +) +from ._base_model import _BaseModel + +xr.set_options(keep_attrs=True) + + +class _BaseModelSingleSet(_BaseModel): + """ + Abstract base class for single-set models. + + Parameters + ---------- + n_modes: int, default=10 + Number of modes to calculate. + center: bool, default=True + Whether to center the input data. + standardize: bool, default=False + Whether to standardize the input data. + use_coslat: bool, default=False + Whether to use cosine of latitude for scaling. + check_nans : bool, default=True + If True, remove full-dimensional NaN features from the data, check to ensure + that NaN features match the original fit data during transform, and check + for isolated NaNs. Note: this forces eager computation of dask arrays. + If False, skip all NaN checks. In this case, NaNs should be explicitly removed + or filled prior to fitting, or SVD will fail. + sample_name: str, default="sample" + Name of the sample dimension. + feature_name: str, default="feature" + Name of the feature dimension. + compute : bool, default=True + Whether to compute elements of the model eagerly, or to defer computation. + If True, four pieces of the fit will be computed sequentially: 1) the + preprocessor scaler, 2) optional NaN checks, 3) SVD decomposition, 4) scores + and components. + verbose: bool, default=False + Whether to show a progress bar when computing the decomposition. + random_state: int | None, default=None + Seed for the random number generator. + solver: {"auto", "full", "randomized"}, default="auto" + Solver to use for the SVD computation. + solver_kwargs: dict, default={} + Additional keyword arguments to pass to the SVD solver function. 
+ + """ + + def __init__( + self, + n_modes=10, + center=True, + standardize=False, + use_coslat=False, + check_nans=True, + sample_name="sample", + feature_name="feature", + compute=True, + verbose=False, + random_state=None, + solver="auto", + solver_kwargs={}, + ): + if verbose: + warnings.warn( + "The 'verbose' parameter is deprecated and will be removed in a future release.", + category=DeprecationWarning, + stacklevel=3, + ) + + super().__init__() + + self.n_modes = n_modes + self.sample_name = sample_name + self.feature_name = feature_name + + # Define model parameters + self._params = { + "n_modes": n_modes, + "center": center, + "standardize": standardize, + "use_coslat": use_coslat, + "check_nans": check_nans, + "sample_name": sample_name, + "feature_name": feature_name, + "random_state": random_state, + "verbose": verbose, + "compute": compute, + "solver": solver, + "solver_kwargs": solver_kwargs, + } + self._decomposer_kwargs = { + "n_modes": n_modes, + "solver": solver, + "random_state": random_state, + "compute": compute, + "verbose": verbose, + "solver_kwargs": solver_kwargs, + } + + # Define analysis-relevant meta data + self.attrs.update({"model": "BaseModelSingleSet"}) + self.attrs.update(self._params) + + # Initialize the Preprocessor to scale and stack the data + self.preprocessor = Preprocessor( + sample_name=sample_name, + feature_name=feature_name, + with_center=center, + with_std=standardize, + with_coslat=use_coslat, + check_nans=check_nans, + compute=compute, + ) + # Initialize the data container that stores the results + self.data = DataContainer() + + def get_serialization_attrs(self) -> dict: + return dict( + data=self.data, + preprocessor=self.preprocessor, + ) + + def fit( + self, + X: DataObject, + dim: Sequence[Hashable] | Hashable, + weights: DataObject | None = None, + ) -> Self: + """ + Fit the model to the input data. + + Parameters + ---------- + X: DataObject + Input data. + dim: Sequence[Hashable] | Hashable + Specify the sample dimensions. The remaining dimensions + will be treated as feature dimensions. + weights: DataObject | None, default=None + Weighting factors for the input data. + + """ + # Check for invalid types + validate_input_type(X) + if weights is not None: + validate_input_type(weights) + + self.sample_dims = convert_to_dim_type(dim) + + # Preprocess the data & transform to 2D + data2D: DataArray = self.preprocessor.fit_transform( + X, self.sample_dims, weights + ) + + self._fit_algorithm(data2D) + + if self._params["compute"]: + self.data.compute() + + return self + + @abstractmethod + def _fit_algorithm(self, data: DataArray) -> Self: + """Fit the model to the input data assuming a 2D DataArray. + + Parameters + ---------- + data: DataArray + Input data with dimensions (sample_name, feature_name) + + Returns + ------- + self: Self + The fitted model. + + """ + raise NotImplementedError + + def transform(self, data: DataObject, normalized=True) -> DataArray: + """Project data onto the components. + + Parameters + ---------- + data: DataObject + Data to be transformed. + normalized: bool, default=True + Whether to normalize the scores by the L2 norm. + + Returns + ------- + projections: DataArray + Projections of the data onto the components. 
+ + """ + validate_input_type(data) + + data2D = self.preprocessor.transform(data) + data2D = self._transform_algorithm(data2D) + if normalized: + data2D = data2D / self.data["norms"] + data2D.name = "scores" + return self.preprocessor.inverse_transform_scores_unseen(data2D) + + @abstractmethod + def _transform_algorithm(self, data: DataArray) -> DataArray: + """Project data onto the components. + + Parameters + ---------- + data: DataArray + Input data with dimensions (sample_name, feature_name) + + Returns + ------- + projections: DataArray + Projections of the data onto the components. + + """ + raise NotImplementedError + + def fit_transform( + self, + data: DataObject, + dim: Sequence[Hashable] | Hashable, + weights: DataObject | None = None, + **kwargs, + ) -> DataArray: + """Fit the model to the input data and project the data onto the components. + + Parameters + ---------- + data: DataObject + Input data. + dim: Sequence[Hashable] | Hashable + Specify the sample dimensions. The remaining dimensions + will be treated as feature dimensions. + weights: DataObject | None, default=None + Weighting factors for the input data. + **kwargs + Additional keyword arguments to pass to the transform method. + + Returns + ------- + projections: DataArray + Projections of the data onto the components. + + """ + return self.fit(data, dim, weights).transform(data, **kwargs) + + def inverse_transform( + self, scores: DataArray, normalized: bool = True + ) -> DataObject: + """Reconstruct the original data from transformed data. + + Parameters + ---------- + scores: DataArray + Transformed data to be reconstructed. This could be a subset + of the `scores` data of a fitted model, or unseen data. Must + have a 'mode' dimension. + normalized: bool, default=True + Whether the scores data have been normalized by the L2 norm. + + Returns + ------- + data: DataObject + Reconstructed data. + + """ + if normalized: + norms = self.data["norms"].sel(mode=scores.mode) + scores = scores * norms + + # Handle scalar mode in xr.dot + if "mode" not in scores.dims: + scores = scores.expand_dims("mode") + + data_reconstructed = self._inverse_transform_algorithm(scores) + + # Reconstructing the data using a single mode introduces a + # redundant "mode" coordinate + if "mode" in data_reconstructed.coords: + data_reconstructed = data_reconstructed.drop_vars("mode") + + return self.preprocessor.inverse_transform_data(data_reconstructed) + + @abstractmethod + def _inverse_transform_algorithm(self, scores: DataArray) -> DataArray: + """Reconstruct the original data from transformed data. + + Parameters + ---------- + scores: DataObject + Transformed data to be reconstructed. This could be a subset + of the `scores` data of a fitted model, or unseen data. Must + have a 'mode' dimension. + + Returns + ------- + data: DataArray + Reconstructed 2D data with dimensions (sample_name, feature_name) + + """ + raise NotImplementedError + + def components(self) -> DataObject: + """Get the components.""" + components = self.data["components"] + return self.preprocessor.inverse_transform_components(components) + + def scores(self, normalized=True) -> DataArray: + """Get the scores. + + Parameters + ---------- + normalized: bool, default=True + Whether to normalize the scores by the L2 norm. 
+ """ + scores = self.data["scores"].copy() + if normalized: + attrs = scores.attrs.copy() + scores = scores / self.data["norms"] + scores.attrs.update(attrs) + scores.name = "scores" + return self.preprocessor.inverse_transform_scores(scores) diff --git a/xeofs/models/_np_classes/_svd.py b/xeofs/models/_np_classes/_svd.py new file mode 100644 index 0000000..b6e9329 --- /dev/null +++ b/xeofs/models/_np_classes/_svd.py @@ -0,0 +1,266 @@ +import warnings + +import numpy as np +from dask.array import Array as DaskArray # type: ignore +from dask.array.linalg import svd_compressed as dask_svd +from dask.graph_manipulation import wait_on +from scipy.sparse.linalg import svds as complex_svd # type: ignore +from sklearn.utils.extmath import randomized_svd + +from ...utils.sanity_checks import sanity_check_n_modes + + +def get_deterministic_sign_multiplier(data, axis: int): + """Compute a sign multiplier that ensures deterministic output using Dask. + + Parameters: + ------------ + data: da.Array + Input data to determine sorting order. + axis: int + Axis along which to compute the sign multiplier. + + Returns: + --------- + sign_multiplier: da.Array + Sign multiplier that ensures deterministic output. + """ + max_vals = np.max(data, axis=axis) + min_vals = np.min(data, axis=axis) + sign_multiplier = np.where(np.abs(max_vals) >= np.abs(min_vals), 1, -1) + return sign_multiplier + + +class _SVD: + """Decomposes a data object using Singular Value Decomposition (SVD). + + The data object will be decomposed like X = U * S * V.T, where U and V are + orthogonal matrices and S is a diagonal matrix with the singular values on + the diagonal. + + Parameters + ---------- + n_modes : int | float | str + Number of components to be computed. If float, it represents the fraction of variance to keep. If "all", all components are kept. + init_rank_reduction : float, default=0.0 + Used only when `n_modes` is given as a float. Specifiy the initial target rank to be computed by randomized SVD before truncating the solution to the desired fraction of explained variance. Must be in the half open interval (0, 1]. Lower values will speed up the computation. If the rank is too low and the fraction of explained variance is not reached, a warning will be raised. + flip_signs : bool, default=True + Whether to flip the sign of the components to ensure deterministic output. + solver : {'auto', 'full', 'randomized'}, default='auto' + The solver is selected by a default policy based on size of `X` and `n_modes`: + if the input data is larger than 500x500 and the number of modes to extract is lower + than 80% of the smallest dimension of the data, then the more efficient + `randomized` method is enabled. Otherwise the exact full SVD is computed + and optionally truncated afterwards. + random_state : np.random.Generator | int | None, default=None + Seed for the random number generator. + verbose: bool, default=False + Whether to show a progress bar when computing the decomposition. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver. 
+ """ + + def __init__( + self, + n_modes: int | float | str, + init_rank_reduction: float = 0.3, + flip_signs: bool = True, + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + verbose: bool = False, + solver_kwargs: dict = {}, + ): + sanity_check_n_modes(n_modes) + self.is_based_on_variance = True if isinstance(n_modes, float) else False + + if self.is_based_on_variance: + if not (0 < init_rank_reduction <= 1.0): + raise ValueError( + "init_rank_reduction must be in the half open interval (0, 1]." + ) + + self.n_modes = n_modes + self.n_modes_precompute = n_modes + self.init_rank_reduction = init_rank_reduction + self.flip_signs = flip_signs + self.verbose = verbose + self.solver = solver + self.random_state = random_state + self.solver_kwargs = solver_kwargs + + def _get_n_modes_precompute(self, rank: int) -> int: + if self.is_based_on_variance: + n_modes_precompute = int(rank * self.init_rank_reduction) + if n_modes_precompute < 1: + warnings.warn( + f"`init_rank_reduction={self.init_rank_reduction}` is too low resulting in zero components. One component will be computed instead." + ) + n_modes_precompute = 1 + elif self.n_modes_precompute == "all": + n_modes_precompute = rank + + elif isinstance(self.n_modes_precompute, int): + if self.n_modes_precompute > rank: + raise ValueError( + f"n_modes must be less than or equal to the rank of the dataset (rank = {rank})." + ) + n_modes_precompute = self.n_modes_precompute + return n_modes_precompute + + def fit_transform(self, X): + """Decomposes the data object. + + Parameters + ---------- + X : DataArray + A 2-dimensional data object to be decomposed. + """ + rank = min(X.shape) + self.n_modes_precompute = self._get_n_modes_precompute(rank) + + # Check if data is small enough to use exact SVD + # If not, use randomized SVD + # If data is complex, use scipy sparse SVD + # If data is dask, use dask SVD + # Conditions for using exact SVD follow scitkit-learn's PCA implementation + # Source: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html + + use_dask = True if isinstance(X, DaskArray) else False + use_complex = True if np.iscomplexobj(X) else False + + is_small_data = max(X.shape) < 500 + + match self.solver: + # TODO(nicrie): implement more performant case for tall and skinny problems which are best handled by precomputing the covariance matrix. + # if X.shape[1] <= 1_000 and X.shape[0] >= 10 * X.shape[1]: -> covariance_eigh" (see sklearn PCA implementation: https://github.com/scikit-learn/scikit-learn/blob/e87b32a81c70abed8f2e97483758eb64df8255e9/sklearn/decomposition/_pca.py#L526) + case "auto": + has_many_modes = self.n_modes_precompute > int(0.8 * rank) + use_exact = ( + True if is_small_data and has_many_modes and not use_dask else False + ) + case "full": + use_exact = True + case "randomized": + use_exact = False + case _: + raise ValueError( + f"Unrecognized solver '{self.solver}'. " + "Valid options are 'auto', 'full', and 'randomized'." 
+ ) + + # Use exact SVD for small data sets + if use_exact: + U, s, VT = self._svd(X, np.linalg.svd, self.solver_kwargs) + U = U[:, : self.n_modes_precompute] + s = s[: self.n_modes_precompute] + VT = VT[: self.n_modes_precompute, :] + + # Use randomized SVD for large, real-valued data sets + elif (not use_complex) and (not use_dask): + solver_kwargs = self.solver_kwargs | { + "n_components": self.n_modes_precompute, + "random_state": self.random_state, + } + U, s, VT = self._svd(X, randomized_svd, solver_kwargs) + + # Use scipy sparse SVD for large, complex-valued data sets + elif use_complex and (not use_dask): + # Scipy sparse version + solver_kwargs = self.solver_kwargs | { + "k": self.n_modes_precompute, + "solver": "lobpcg", + "random_state": self.random_state, + } + U, s, VT = self._svd(X, complex_svd, solver_kwargs) + idx_sort = np.argsort(s)[::-1] + U = U[:, idx_sort] + s = s[idx_sort] + VT = VT[idx_sort, :] + + # Use dask SVD for large, real-valued, delayed data sets + elif (not use_complex) and use_dask: + solver_kwargs = self.solver_kwargs | { + "k": self.n_modes_precompute, + "seed": self.random_state, + } + solver_kwargs.setdefault("compute", False) + solver_kwargs.setdefault("n_power_iter", 4) + U, s, VT = self._svd(X, dask_svd, solver_kwargs) + else: + err_msg = ( + "Complex data together with dask is currently not implemented. See dask issue 7639 " + "https://github.com/dask/dask/issues/7639" + ) + raise NotImplementedError(err_msg) + + U, s, VT = wait_on(U, s, VT) + + V = VT.conj().T + + # Flip signs of components to ensure deterministic output + if self.flip_signs: + sign_multiplier = get_deterministic_sign_multiplier(V, axis=0) + V *= sign_multiplier + U *= sign_multiplier + + # Truncate the decomposition to the desired number of modes + if self.is_based_on_variance: + # Truncating based on variance requires computation of dask array + # which we prefer to avoid + if use_dask: + err_msg = "Estimating the number of modes to keep based on variance is not supported with dask arrays. Please explicitly specifiy the number of modes to keep by using an integer for the number of modes." + raise ValueError(err_msg) + + # Compute the fraction of explained variance per mode + N = X.shape[0] - 1 + total_variance = X.var(axis=0, ddof=1).sum() + explained_variance = s**2 / N / total_variance + cum_expvar = explained_variance.cumsum() + total_explained_variance = cum_expvar[-1].item() + + n_modes_required = ( + self.n_modes_precompute - (cum_expvar >= self.n_modes).sum() + 1 + ) + if n_modes_required > self.n_modes_precompute: + warnings.warn( + f"Dataset has {self.n_modes_precompute} components, explaining {total_explained_variance:.2%} of the variance. However, {self.n_modes:.2%} explained variance was requested. Please consider increasing `init_rank_reduction`." + ) + n_modes_required = self.n_modes_precompute + + # Truncate solution to the desired fraction of explained variance + U = U[:, :n_modes_required] + s = s[:n_modes_required] + V = V[:, :n_modes_required] + + return U, s, V + + def _svd(self, X, func, kwargs): + """Performs SVD on the data + + Parameters + ---------- + X : DataArray + A 2-dimensional data object to be decomposed. + func : Callable + Method to perform SVD. + kwargs : dict + Additional keyword arguments passed to the SVD solver. + + Returns + ------- + U : DataArray + Left singular vectors. + s : DataArray + Singular values. + VT : DataArray + Right singular vectors. 
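+        Raises
+        ------
+        numpy.linalg.LinAlgError
+            If the underlying SVD solver fails, which is typically caused by
+            isolated NaN values in the data.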
+ """ + try: + U, s, VT = func(X, **kwargs) + return U, s, VT + except np.linalg.LinAlgError: + raise np.linalg.LinAlgError( + "SVD failed. This may be due to isolated NaN values in the data. Please consider the following steps:\n" + "1. Check for and remove any isolated NaNs in your dataset.\n" + "2. If the error persists, please raise an issue at https://github.com/xarray-contrib/xeofs/issues." + ) diff --git a/xeofs/models/cpcca.py b/xeofs/models/cpcca.py new file mode 100644 index 0000000..546e6c2 --- /dev/null +++ b/xeofs/models/cpcca.py @@ -0,0 +1,1464 @@ +import warnings +from typing import Dict, Optional, Sequence, Tuple + +import numpy as np +import xarray as xr +from typing_extensions import Self + +from ..utils.data_types import DataArray, DataObject +from ..utils.hilbert_transform import hilbert_transform +from ..utils.linalg import fractional_matrix_power +from ..utils.statistics import pearson_correlation +from ..utils.xarray_utils import argsort_dask +from ._base_model_cross_set import _BaseModelCrossSet +from .decomposer import Decomposer + + +class CPCCA(_BaseModelCrossSet): + """Continuum Power CCA (CPCCA). + + CPCCA extends continuum power regression to isolate pairs of coupled + patterns, maximizing the squared covariance between partially whitened + variables [1]_ [2]_. + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^T X^T Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^T (X^TX)^{1-\\alpha_x} q_x = 1, \\quad q_y^T + (Y^TY)^{1-\\alpha_y} q_y = 1` + + where :math:`\\alpha_x` and :math:`\\alpha_y` control the degree of + whitening applied to the data. + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. + alpha : Sequence[float] | float, default=0.2 + Degree of whitening applied to the data. If float, the same value is + applied to both data sets. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is then computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. 
+ sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + Notes + ----- + Canonical Correlation Analysis (CCA), Maximum Covariance Analysis (MCA) and + Redundany Analysis (RDA) are all special cases of CPCCA depending on the + choice of the parameter :math:`\\alpha`. + + References + ---------- + .. [1] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + .. [2] Wilks, D. S. Statistical Methods in the Atmospheric Sciences. + (Academic Press, 2019). + doi:https://doi.org/10.1016/B978-0-12-815823-4.00011-0. + + Examples + -------- + Perform regular CCA on two data sets: + + >>> model = CPCCA(n_modes=5, alpha=0.0) + >>> model.fit(X, Y) + + Perform regularized CCA on two data sets: + + >>> model = CPCCA(n_modes=5, alpha=0.2) + >>> model.fit(X, Y) + + Perform Maximum Covariance Analysis: + + >>> model = CPCCA(n_modes=5, alpha=1.0) + >>> model.fit(X, Y) + + Perform Redundancy Analysis: + + >>> model = CPCCA(n_modes=5, alpha=[0, 1]) + >>> model.fit(X, Y) + + Make predictions for `Y` given `X`: + + >>> scores_y_pred = model.predict(X) # prediction in "PC" space + >>> Y_pred = model.inverse_transform(Y=scores_y_pred) # prediction in physical space + + + """ + + def __init__( + self, + n_modes: int = 2, + alpha: Sequence[float] | float = 0.2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + check_nans: Sequence[bool] | bool = True, + compute: bool = True, + verbose: bool = False, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + **kwargs, + ): + super().__init__( + n_modes=n_modes, + center=True, + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + alpha=alpha, + compute=compute, + verbose=verbose, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + ) + self.attrs.update({"model": "Continuum Power CCA"}) + # Remove center from the inherited serialization params because it is hard-coded for CPCCA + self._params.pop("center") + + params = self.get_params() + self.sample_name: str = params["sample_name"] + self.feature_name: tuple[str, str] = params["feature_name"] + + def _fit_algorithm( + self, + X: DataArray, + Y: DataArray, + ) -> Self: + feature_name = self.feature_name + + # Compute the totalsquared covariance from the unwhitened data + C_whitened = self._compute_cross_matrix( + X, + Y, + sample_dim=self.sample_name, + feature_dim_x=feature_name[0], + feature_dim_y=feature_name[1], + method="covariance", + diagonal=False, + ) + + # Initialize the SVD decomposer + decomposer = Decomposer(**self._decomposer_kwargs) + dims = (feature_name[0], feature_name[1]) + decomposer.fit(C_whitened, dims=dims) + + # Store the results + singular_values = decomposer.s_ + Q1 = decomposer.U_ + Q2 = 
decomposer.V_ + + # Compute total squared variance + total_squared_covariance = self._compute_total_squared_covariance(C_whitened) + + # Index of the sorted covariance explained + idx_sorted_modes = argsort_dask(singular_values, "mode")[::-1] # type: ignore + idx_sorted_modes.coords.update(singular_values.coords) + + # Project the data onto the singular vectors + scores1 = xr.dot(X, Q1, dims=feature_name[0]) + scores2 = xr.dot(Y, Q2, dims=feature_name[1]) + + norm1 = np.sqrt(xr.dot(scores1.conj(), scores1, dims=self.sample_name)).real + norm2 = np.sqrt(xr.dot(scores2.conj(), scores2, dims=self.sample_name)).real + + self.data.add(name="input_data1", data=X, allow_compute=False) + self.data.add(name="input_data2", data=Y, allow_compute=False) + self.data.add(name="components1", data=Q1) + self.data.add(name="components2", data=Q2) + self.data.add(name="scores1", data=scores1) + self.data.add(name="scores2", data=scores2) + self.data.add(name="singular_values", data=singular_values) + self.data.add(name="squared_covariance", data=singular_values**2) + self.data.add(name="total_squared_covariance", data=total_squared_covariance) + self.data.add(name="idx_modes_sorted", data=idx_sorted_modes) + self.data.add(name="norm1", data=norm1) + self.data.add(name="norm2", data=norm2) + + # # Assign analysis-relevant meta data + self.data.set_attrs(self.attrs) + return self + + def _transform_algorithm( + self, + X: Optional[DataArray] = None, + Y: Optional[DataArray] = None, + normalized=False, + ) -> Dict[str, DataArray]: + results = {} + if X is not None: + # Project data onto singular vectors + comps1 = self.data["components1"] + norm1 = self.data["norm1"] + scores1 = xr.dot(X, comps1) + if normalized: + scores1 = scores1 / norm1 + results["data1"] = scores1 + + if Y is not None: + # Project data onto singular vectors + comps2 = self.data["components2"] + norm2 = self.data["norm2"] + scores2 = xr.dot(Y, comps2) + if normalized: + scores2 = scores2 / norm2 + results["data2"] = scores2 + + return results + + def _inverse_transform_algorithm( + self, X: DataArray | None = None, Y: DataArray | None = None + ) -> dict[str, DataArray]: + x_is_given = X is not None + y_is_given = Y is not None + + if (not x_is_given) and (not y_is_given): + raise ValueError("Either X or Y must be given.") + + results = {} + + if x_is_given: + # Singular vectors + comps1 = self.data["components1"].sel(mode=X.mode) + # Reconstruct the data + results["X"] = xr.dot(X, comps1.conj(), dims="mode") + + if y_is_given: + # Singular vectors + comps2 = self.data["components2"].sel(mode=Y.mode) + # Reconstruct the data + results["Y"] = xr.dot(Y, comps2.conj(), dims="mode") + + return results + + def _predict_algorithm(self, X: DataArray) -> DataArray: + sample_name_fit_x = "sample_fit_dim_x" + sample_name_fit_y = "sample_fit_dim_y" + Qx = self.data["components1"] + Rx = self.data["scores1"].rename({self.sample_name: sample_name_fit_x}) + Ry = self.data["scores2"].rename({self.sample_name: sample_name_fit_y}) + + def _predict_numpy(X, Qx, Rx, Ry): + G = Rx.conj().T @ Ry / np.linalg.norm(Rx, axis=0) ** 2 + return X @ Qx @ G + + Ry_pred = xr.apply_ufunc( + _predict_numpy, + X, + Qx, + Rx, + Ry, + input_core_dims=[ + [self.sample_name, self.feature_name[0]], + [self.feature_name[0], "mode"], + [sample_name_fit_x, "mode"], + [sample_name_fit_y, "mode"], + ], + output_core_dims=[[self.sample_name, "mode"]], + dask="allowed", + ) + Ry_pred.name = "pseudo_scores_Y" + return Ry_pred + + def _get_components(self, normalized=True): + comps1 = 
self.data["components1"] + comps2 = self.data["components2"] + + if not normalized: + comps1 = comps1 * self.data["norm1"] + comps2 = comps2 * self.data["norm2"] + + return comps1, comps2 + + def _get_scores(self, normalized=False): + norm1 = self.data["norm1"] + norm2 = self.data["norm2"] + + scores1 = self.data["scores1"] + scores2 = self.data["scores2"] + + if normalized: + scores1 = scores1 / norm1 + scores2 = scores2 / norm2 + + return scores1, scores2 + + def cross_correlation_coefficients(self): + """Get the cross-correlation coefficients. + + The cross-correlation coefficients between the scores of ``X`` and ``Y`` + are computed as: + + .. math:: + c_{xy, i} = \\text{corr} \\left(\\mathbf{r}_{x, i}, \\mathbf{r}_{y, i} \\right) + + where :math:`\\mathbf{r}_{x, i}` and :math:`\\mathbf{r}_{y, i}` are the + `i`th scores of ``X`` and ``Y``, + + Notes + ----- + When :math:`\\alpha=0`, the cross-correlation coefficients are + equivalent to the canonical correlation coefficients. + + """ + + Rx = self.data["scores1"] + Ry = self.data["scores2"] + + cross_corr = self._compute_cross_matrix( + Rx, + Ry, + sample_dim=self.sample_name, + feature_dim_x="mode", + feature_dim_y="mode", + method="correlation", + diagonal=True, + ) + cross_corr = cross_corr.real + cross_corr.name = "cross_correlation_coefficients" + return cross_corr + + def correlation_coefficients_X(self): + """Get the correlation coefficients for the scores of :math:`X`. + + The correlation coefficients of the scores of :math:`X` are given by: + + .. math:: + c_{x, ij} = \\text{corr} \\left(\\mathbf{r}_{x, i}, \\mathbf{r}_{x, j} \\right) + + where :math:`\\mathbf{r}_{x, i}` and :math:`\\mathbf{r}_{x, j}` are the + `i`th and `j`th scores of :math:`X`. + + """ + Rx = self.data["scores1"] + + corr = self._compute_cross_matrix( + Rx, + Rx, + sample_dim=self.sample_name, + feature_dim_x="mode", + feature_dim_y="mode", + method="correlation", + diagonal=False, + ) + corr.name = "correlation_coefficients_X" + return corr + + def correlation_coefficients_Y(self): + """Get the correlation coefficients for the scores of :math:`Y`. + + The correlation coefficients of the scores of :math:`Y` are given by: + + .. math:: + c_{y, ij} = \\text{corr} \\left(\\mathbf{r}_{y, i}, \\mathbf{r}_{y, j} \\right) + + where :math:`\\mathbf{r}_{y, i}` and :math:`\\mathbf{r}_{y, j}` are the + `i`th and `j`th scores of :math:`Y`. + + """ + Ry = self.data["scores2"] + + corr = self._compute_cross_matrix( + Ry, + Ry, + sample_dim=self.sample_name, + feature_dim_x="mode", + feature_dim_y="mode", + method="correlation", + diagonal=False, + ) + corr.name = "correlation_coefficients_Y" + return corr + + def squared_covariance_fraction(self): + """Get the squared covariance fraction (SCF). + + The SCF is computed as a weighted mean-square error (see equation (15) + in Swenson (2015)) : + + .. math:: + SCF_{i} = 1 - \\frac{\\|\\mathbf{d}_{X,i}^T \\mathbf{d}_{Y,i}\\|_F^2}{\\|X^TY\\|_F^2} + + where :math:`\\mathbf{d}_{X,i}` and :math:`\\mathbf{d}_{Y,i}` are the + residuals of the input data :math:`X` and :math:`Y` after reconstruction + by the `ith` scores of :math:`X` and :math:`Y`, respectively. + + References + ---------- + Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). 
+ + """ + + def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): + dX = X - Xrec + dY = Y - Yrec + + return np.linalg.norm(dX.conj().T @ dY / (dX.shape[0] - 1)) ** 2 + + total_squared_covariance = self.data["total_squared_covariance"] + sample_name_x = "sample_dim_x" + sample_name_y = "sample_dim_y" + + # Get the singular vectors + Q1 = self.data["components1"] + Q2 = self.data["components2"] + + # Get input data + X1 = self.data["input_data1"] + X2 = self.data["input_data2"] + + # Unwhiten the data + X1 = self.whitener1.inverse_transform_data(X1, unwhiten_only=True) + X2 = self.whitener2.inverse_transform_data(X2, unwhiten_only=True) + + # Rename the sample dimension to avoid conflicts for + # different coordinates with same length + X1 = X1.rename({self.sample_name: sample_name_x}) + X2 = X2.rename({self.sample_name: sample_name_y}) + + # Get the component scores + scores1 = self.data["scores1"] + scores2 = self.data["scores2"] + + # Compute the residual variance for each mode + squared_covariance_fraction: list[DataArray] = [] + for mode in scores1.mode.values: + # Reconstruct the data + X1r = xr.dot( + scores1.sel(mode=[mode]), Q1.sel(mode=[mode]).conj().T, dims="mode" + ) + X2r = xr.dot( + scores2.sel(mode=[mode]), Q2.sel(mode=[mode]).conj().T, dims="mode" + ) + + # Unwhitend the reconstructed data + X1r = self.whitener1.inverse_transform_data(X1r, unwhiten_only=True) + X2r = self.whitener2.inverse_transform_data(X2r, unwhiten_only=True) + + # Compute fraction variance explained + X1r = X1r.rename({self.sample_name: sample_name_x}) + X2r = X2r.rename({self.sample_name: sample_name_y}) + res_var: DataArray = xr.apply_ufunc( + _compute_residual_variance_numpy, + X1, + X2, + X1r, + X2r, + input_core_dims=[ + [sample_name_x, self.feature_name[0]], + [sample_name_y, self.feature_name[1]], + [sample_name_x, self.feature_name[0]], + [sample_name_y, self.feature_name[1]], + ], + output_core_dims=[[]], + dask="allowed", + ) + res_var = res_var.expand_dims({"mode": [mode]}) + squared_covariance_fraction.append(1 - res_var / total_squared_covariance) + + scf = xr.concat(squared_covariance_fraction, dim="mode") + scf.name = "squared_covariance_fraction" + + # In theory, the residual can be larger than the total squared covariance + # if a mode is not well defined. In this case, the SCF would be negative. + # We set these values to zero. + scf = xr.where(scf < 0, 0, scf) + return scf + + def fraction_variance_X_explained_by_X(self): + """Get the fraction of variance explained (FVE X). + + The FVE X is the fraction of variance in :math:`X` explained by the + scores of :math:`X`. It is computed as a weighted mean-square error (see + equation (15) in Swenson (2015)) : + + .. math:: + FVE_{X|X,i} = 1 - \\frac{\\|\\mathbf{d}_{X,i}\\|_F^2}{\\|X\\|_F^2} + + where :math:`\\mathbf{d}_{X,i}` are the residuals of the input data + :math:`X` after reconstruction by the `ith` scores of :math:`X`. + + References + ---------- + Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). 
+ + """ + # Get the singular vectors + Qx = self.data["components1"] + + # Get input data + X = self.data["input_data1"] + + # Unwhiten the data + X = self.whitener1.inverse_transform_data(X, unwhiten_only=True) + + # Compute the total variance + total_variance: DataArray = self._compute_total_variance(X, self.sample_name) + + # Get the component scores + Rx = self.data["scores1"] + + # Compute the residual variance for each mode + fraction_variance_explained: list[DataArray] = [] + for mode in Rx.mode.values: + # Reconstruct the data + Xr = xr.dot(Rx.sel(mode=[mode]), Qx.sel(mode=[mode]).conj().T, dims="mode") + + # Unwhitend the reconstructed data + Xr = self.whitener1.inverse_transform_data(Xr, unwhiten_only=True) + + # Compute fraction variance explained + residual_variance = self._compute_total_variance(X - Xr, self.sample_name) + residual_variance = residual_variance.expand_dims({"mode": [mode]}) + fraction_variance_explained.append(1 - residual_variance / total_variance) + + fve_xx = xr.concat(fraction_variance_explained, dim="mode") + fve_xx.name = "fraction_variance_X_explained_by_X" + return fve_xx + + def fraction_variance_Y_explained_by_Y(self): + """Get the fraction of variance explained (FVE Y). + + The FVE Y is the fraction of variance in :math:`Y` explained by the + scores of :math:`Y`. It is computed as a weighted mean-square error (see + equation (15) in Swenson (2015)) : + + .. math:: + FVE_{Y|Y,i} = 1 - \\frac{\\|\\mathbf{d}_{Y,i}\\|_F^2}{\\|Y\\|_F^2} + + where :math:`\\mathbf{d}_{Y,i}` are the residuals of the input data + :math:`Y` after reconstruction by the `ith` scores of :math:`Y`. + + References + ---------- + Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + + """ + # Get the singular vectors + Qy = self.data["components2"] + + # Get input data + Y = self.data["input_data2"] + + # Unwhiten the data + Y = self.whitener2.inverse_transform_data(Y, unwhiten_only=True) + + # Compute the total variance + total_variance: DataArray = self._compute_total_variance(Y, self.sample_name) + + # Get the component scores + Ry = self.data["scores2"] + + # Compute the residual variance for each mode + fraction_variance_explained: list[DataArray] = [] + for mode in Ry.mode.values: + # Reconstruct the data + Yr = xr.dot(Ry.sel(mode=[mode]), Qy.sel(mode=[mode]).conj().T, dims="mode") + + # Unwhitend the reconstructed data + Yr = self.whitener2.inverse_transform_data(Yr, unwhiten_only=True) + + # Compute fraction variance explained + residual_variance = self._compute_total_variance(Y - Yr, self.sample_name) + residual_variance = residual_variance.expand_dims({"mode": [mode]}) + fraction_variance_explained.append(1 - residual_variance / total_variance) + + fve_yy = xr.concat(fraction_variance_explained, dim="mode") + fve_yy.name = "fraction_variance_Y_explained_by_Y" + return fve_yy + + def fraction_variance_Y_explained_by_X(self) -> DataArray: + """Get the fraction of variance explained (FVE YX). + + The FVE YX is the fraction of variance in :math:`Y` explained by the + scores of :math:`X`. It is computed as a weighted mean-square error (see + equation (15) in Swenson (2015)) : + + .. 
math:: + FVE_{Y|X,i} = 1 - \\frac{\\|(X^TX)^{-1/2} \\mathbf{d}_{X,i}^T \\mathbf{d}_{Y,i}\\|_F^2}{\\|(X^TX)^{-1/2} X^TY\\|_F^2} + + where :math:`\\mathbf{d}_{X,i}` and :math:`\\mathbf{d}_{Y,i}` are the + residuals of the input data :math:`X` and :math:`Y` after reconstruction + by the `ith` scores of :math:`X` and :math:`Y`, respectively. + + References + ---------- + Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + + """ + + def _compute_total_variance_numpy(X, Y): + Cx = X.conj().T @ X / (X.shape[0] - 1) + Tinv = fractional_matrix_power(Cx, -0.5) + return np.linalg.norm(Tinv @ X.conj().T @ Y / (X.shape[0] - 1)) ** 2 + + def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): + dX = X - Xrec + dY = Y - Yrec + + Cx = X.conj().T @ X / (X.shape[0] - 1) + Tinv = fractional_matrix_power(Cx, -0.5) + return np.linalg.norm(Tinv @ dX.conj().T @ dY / (dX.shape[0] - 1)) ** 2 + + sample_name_x = "sample_dim_x" + sample_name_y = "sample_dim_y" + + # Get the singular vectors + Q1 = self.data["components1"] + Q2 = self.data["components2"] + + # Get input data + X1 = self.data["input_data1"] + X2 = self.data["input_data2"] + + # Unwhiten the data + X1 = self.whitener1.inverse_transform_data(X1, unwhiten_only=True) + X2 = self.whitener2.inverse_transform_data(X2, unwhiten_only=True) + + # Compute the total variance + X1 = X1.rename({self.sample_name: sample_name_x}) + X2 = X2.rename({self.sample_name: sample_name_y}) + total_variance: DataArray = xr.apply_ufunc( + _compute_total_variance_numpy, + X1, + X2, + input_core_dims=[ + [sample_name_x, self.feature_name[0]], + [sample_name_y, self.feature_name[1]], + ], + output_core_dims=[[]], + dask="allowed", + ) + + # Get the component scores + scores1 = self.data["scores1"] + scores2 = self.data["scores2"] + + # Compute the residual variance for each mode + fraction_variance_explained: list[DataArray] = [] + for mode in scores1.mode.values: + # Reconstruct the data + X1r = xr.dot( + scores1.sel(mode=[mode]), Q1.sel(mode=[mode]).conj().T, dims="mode" + ) + X2r = xr.dot( + scores2.sel(mode=[mode]), Q2.sel(mode=[mode]).conj().T, dims="mode" + ) + + # Unwhitend the reconstructed data + X1r = self.whitener1.inverse_transform_data(X1r, unwhiten_only=True) + X2r = self.whitener2.inverse_transform_data(X2r, unwhiten_only=True) + + # Compute fraction variance explained + X1r = X1r.rename({self.sample_name: sample_name_x}) + X2r = X2r.rename({self.sample_name: sample_name_y}) + res_var: DataArray = xr.apply_ufunc( + _compute_residual_variance_numpy, + X1, + X2, + X1r, + X2r, + input_core_dims=[ + [sample_name_x, self.feature_name[0]], + [sample_name_y, self.feature_name[1]], + [sample_name_x, self.feature_name[0]], + [sample_name_y, self.feature_name[1]], + ], + output_core_dims=[[]], + dask="allowed", + ) + res_var = res_var.expand_dims({"mode": [mode]}) + fraction_variance_explained.append(1 - res_var / total_variance) + + fve_yx = xr.concat(fraction_variance_explained, dim="mode") + fve_yx.name = "fraction_variance_Y_explained_by_X" + return fve_yx + + def homogeneous_patterns(self, correction=None, alpha=0.05): + """Get the homogeneous correlation patterns. + + The homogeneous correlation patterns are the correlation coefficients + between the input data and the scores. They are defined as: + + .. math:: + H_{X, i} = \\text{corr} \\left(X, \\mathbf{r}_{x,i} \\right) + + .. 
math:: + H_{Y, i} = \\text{corr} \\left(Y, \\mathbf{r}_{y,i} \\right) + + where :math:`X` and :math:`Y` are the input data, and + :math:`\\mathbf{r}_{x,i}` and :math:`\\mathbf{r}_{y,i}` are the `i`th + scores of :math:`X` and :math:`Y`, respectively. + + + Parameters + ---------- + correction: str, default=None + Method to apply a multiple testing correction. If None, no + correction is applied. Available methods are: - bonferroni : + one-step correction - sidak : one-step correction - holm-sidak : + step down method using Sidak adjustments - holm : step-down method + using Bonferroni adjustments - simes-hochberg : step-up method + (independent) - hommel : closed method based on Simes tests + (non-negative) - fdr_bh : Benjamini/Hochberg (non-negative) + (default) - fdr_by : Benjamini/Yekutieli (negative) - fdr_tsbh : two + stage fdr correction (non-negative) - fdr_tsbky : two stage fdr + correction (non-negative) + alpha: float, default=0.05 + The desired family-wise error rate. Not used if `correction` is + None. + + Returns + ------- + tuple[DataObject, DataObject] + Homogenous correlation patterns of `X` and `Y`. + tuple[DataObject, DataObject] + p-values of the homogenous correlation patterns of `X` and `Y`. + + """ + input_data1 = self.data["input_data1"] + input_data2 = self.data["input_data2"] + + input_data1 = self.whitener1.inverse_transform_data(input_data1) + input_data2 = self.whitener2.inverse_transform_data(input_data2) + + scores1 = self.data["scores1"] + scores2 = self.data["scores2"] + + hom_pat1, pvals1 = pearson_correlation( + input_data1, + scores1, + correction=correction, + alpha=alpha, + sample_name=self.sample_name, + feature_name=self.feature_name[0], + ) + hom_pat2, pvals2 = pearson_correlation( + input_data2, + scores2, + correction=correction, + alpha=alpha, + sample_name=self.sample_name, + feature_name=self.feature_name[1], + ) + + hom_pat1.name = "left_homogeneous_patterns" + hom_pat2.name = "right_homogeneous_patterns" + + pvals1.name = "pvalues_of_left_homogeneous_patterns" + pvals2.name = "pvalues_of_right_homogeneous_patterns" + + hom_pat1 = self.preprocessor1.inverse_transform_components(hom_pat1) + hom_pat2 = self.preprocessor2.inverse_transform_components(hom_pat2) + + pvals1 = self.preprocessor1.inverse_transform_components(pvals1) + pvals2 = self.preprocessor2.inverse_transform_components(pvals2) + + return (hom_pat1, hom_pat2), (pvals1, pvals2) + + def heterogeneous_patterns(self, correction=None, alpha=0.05): + """Get the heterogeneous correlation patterns. + + The heterogeneous patterns are the correlation coefficients between the + input data and the scores of the other field: + + .. math:: + G_{X, i} = \\text{corr} \\left(X, \\mathbf{r}_{y,i} \\right) + + .. math:: + G_{Y, i} = \\text{corr} \\left(Y, \\mathbf{r}_{x,i} \\right) + + where :math:`X` and :math:`Y` are the input data, and + :math:`\\mathbf{r}_{x,i}` and :math:`\\mathbf{r}_{y,i}` are the `i`th + scores of :math:`X` and :math:`Y`, respectively. + + Parameters + ---------- + correction: str, default=None + Method to apply a multiple testing correction. If None, no + correction is applied. 
Available methods are: - bonferroni : + one-step correction - sidak : one-step correction - holm-sidak : + step down method using Sidak adjustments - holm : step-down method + using Bonferroni adjustments - simes-hochberg : step-up method + (independent) - hommel : closed method based on Simes tests + (non-negative) - fdr_bh : Benjamini/Hochberg (non-negative) + (default) - fdr_by : Benjamini/Yekutieli (negative) - fdr_tsbh : two + stage fdr correction (non-negative) - fdr_tsbky : two stage fdr + correction (non-negative) + alpha: float, default=0.05 + The desired family-wise error rate. Not used if `correction` is + None. + + Returns + ------- + tuple[DataObject, DataObject] + Heterogenous correlation patterns of `X` and `Y`. + tuple[DataObject, DataObject] + p-values of the heterogenous correlation patterns of `X` and `Y`. + + """ + input_data1 = self.data["input_data1"] + input_data2 = self.data["input_data2"] + + input_data1 = self.whitener1.inverse_transform_data(input_data1) + input_data2 = self.whitener2.inverse_transform_data(input_data2) + + scores1 = self.data["scores1"] + scores2 = self.data["scores2"] + + patterns1, pvals1 = pearson_correlation( + input_data1, + scores2, + correction=correction, + alpha=alpha, + sample_name=self.sample_name, + feature_name=self.feature_name[0], + ) + patterns2, pvals2 = pearson_correlation( + input_data2, + scores1, + correction=correction, + alpha=alpha, + sample_name=self.sample_name, + feature_name=self.feature_name[1], + ) + + patterns1.name = "left_heterogeneous_patterns" + patterns2.name = "right_heterogeneous_patterns" + + pvals1.name = "pvalues_of_left_heterogeneous_patterns" + pvals2.name = "pvalues_of_right_heterogeneous_patterns" + + patterns1 = self.preprocessor1.inverse_transform_components(patterns1) + patterns2 = self.preprocessor2.inverse_transform_components(patterns2) + + pvals1 = self.preprocessor1.inverse_transform_components(pvals1) + pvals2 = self.preprocessor2.inverse_transform_components(pvals2) + + return (patterns1, patterns2), (pvals1, pvals2) + + def _validate_loaded_data(self, data: xr.DataArray): + if data.attrs.get("placeholder"): + warnings.warn( + f"The input data field '{data.name}' was not saved, which will produce" + " empty results when calling `homogeneous_patterns()` or " + "`heterogeneous_patterns()`. To avoid this warning, you can save the" + " model with `save_data=True`, or add the data manually by running" + " it through the model's `preprocessor.transform()` method and then" + " attaching it with `data.add()`." + ) + + def _compute_cross_matrix( + self, + X: DataArray, + Y: DataArray, + sample_dim: str, + feature_dim_x: str, + feature_dim_y: str, + method: str = "covariance", + diagonal: bool = False, + ) -> DataArray: + """Compute the cross matrix of two data objects. + + Assume centered data. + + Parameters + ---------- + X, Y : DataArray + DataArrays to compute the cross matrix from. + sample_dim : str + Name of the sample dimension. + feature_dim_x, feature_dim_y : str + Name of the feature dimensions. If the feature dimensions are the same, they are renamed to avoid conflicts. + method : {"covariance", "correlation"} + Method to compute the cross matrix. + diagonal : bool, default=False + Whether to compute the diagonal of the cross matrix. + + Returns + ------- + DataArray + The cross matrix of the two data objects. 
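+        Notes
+        -----
+        With ``diagonal=True`` only the coefficients between matching columns
+        are returned (the diagonal of the full cross matrix), so the result
+        keeps a single feature dimension instead of two.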
+ + """ + if feature_dim_x == feature_dim_y: + new_feature_dim_x = feature_dim_x + "_x" + new_feature_dim_y = feature_dim_y + "_y" + X = X.rename({feature_dim_x: new_feature_dim_x}) + Y = Y.rename({feature_dim_y: new_feature_dim_y}) + feature_dim_x = new_feature_dim_x + feature_dim_y = new_feature_dim_y + + # Rename the sample dimension to avoid conflicts for + # different coordinates with same length + sample_dim_x = sample_dim + "_x" + sample_dim_y = sample_dim + "_y" + X = X.rename({sample_dim: sample_dim_x}) + Y = Y.rename({sample_dim: sample_dim_y}) + + if method == "correlation": + X = self._normalize_data(X, sample_dim_x) + Y = self._normalize_data(Y, sample_dim_y) + + if diagonal: + return xr.apply_ufunc( + self._compute_cross_covariance_diagonal_numpy, + X, + Y, + input_core_dims=[ + [sample_dim_x, feature_dim_x], + [sample_dim_y, feature_dim_y], + ], + output_core_dims=[[feature_dim_y]], + dask="allowed", + ).rename({feature_dim_y: feature_dim_y[:-2]}) + else: + return xr.apply_ufunc( + self._compute_cross_covariance_numpy, + X, + Y, + input_core_dims=[ + [sample_dim_x, feature_dim_x], + [sample_dim_y, feature_dim_y], + ], + output_core_dims=[[feature_dim_x, feature_dim_y]], + dask="allowed", + ) + + def _compute_cross_covariance_diagonal_numpy(self, X, Y): + # Assume centered data + return np.diag(self._compute_cross_covariance_numpy(X, Y)) + + def _compute_total_squared_covariance(self, C: DataArray) -> DataArray: + """Compute the total squared covariance. + + Requires the unwhitened covariance matrix which we can obtain by multiplying the whitened covariance matrix with the inverse of the whitening transformation matrix. + """ + C = self.whitener2.inverse_transform_data(C) + C = self.whitener1.inverse_transform_data(C.conj().T) + # Not necessary to conjugate transpose for total squared covariance + # C = C.conj().T + return (abs(C) ** 2).sum() + + @staticmethod + def _compute_total_variance(X: DataArray, dim: str) -> DataArray: + """Compute the total variance of the centered data.""" + return (X * X.conj()).sum() / (X[dim].size - 1) + + @staticmethod + def _compute_cross_covariance_numpy(X, Y): + # Assume centered data + n_samples_x = X.shape[0] + n_samples_y = Y.shape[0] + if n_samples_x != n_samples_y: + err_msg = f"Both data matrices must have the same number of samples but found {n_samples_x} in the first and {n_samples_y} in the second." + raise ValueError(err_msg) + return X.conj().T @ Y / (n_samples_x - 1) + + @staticmethod + def _normalize_data(X, dim): + # Assume centered data + return X / X.std(dim) + + +class ComplexCPCCA(CPCCA): + """Complex CPCCA. + + Complex CPCCA extends classical CPCCA [1]_ by examining amplitude-phase + relationships. It is based on complex-valued fields obtained from a pair of + variables such as the zonal and meridional components, :math:`U` and + :math:`V`, of the wind field, leading to complex-valued data matrices: + + .. math:: + X = U_x + iV_x + + and + + .. math:: + Y = U_y + iV_y + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^H X^H Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^H (X^HX)^{1-\\alpha_x} q_x = 1, \\quad q_y^H + (Y^HY)^{1-\\alpha_y} q_y = 1` + + where :math:`H` denotes the conjugate transpose, :math:`X` and :math:`Y` are + the complex-valued data matrices, and :math:`\\alpha_x` and + :math:`\\alpha_y` control the degree of whitening applied to the data. + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. 
+ alpha : Sequence[float] | float, default=0.2 + Degree of whitening applied to the data. If float, the same value is + applied to both data sets. + padding : Sequence[str] | str | None, default="exp" + Padding method for the Hilbert transform. Available options are: - None: + no padding - "exp": exponential decay + decay_factor : Sequence[float] | float, default=0.2 + Decay factor for the exponential padding. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + Examples + -------- + + With two DataArrays :math:`u_i` and :math:`v_i` representing the zonal and + meridional components of the wind field for two different regions :math:`x` + and :math:`y`, construct + + >>> X = u_x + 1j * v_x + >>> Y = u_y + 1j * v_y + + and fit the Complex CPCCA model: + + >>> model = ComplexCPCCA(n_modes=5) + >>> model.fit(X, Y, "time") + + Finally, extract the amplitude and phase patterns: + + >>> amp_x, amp_y = model.components_amplitude() + >>> phase_x, phase_y = model.components_phase() + + + References + ---------- + .. [1] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). 
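+    Notes
+    -----
+    This class expects the input fields to be complex-valued already. If
+    real-valued data are passed to ``fit``, a warning suggests using the
+    corresponding ``Hilbert`` class instead, which builds the complex-valued
+    field internally via the Hilbert transform.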
+ + """ + + def __init__( + self, + n_modes: int = 2, + alpha: Sequence[float] | float = 0.2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + compute: bool = True, + verbose: bool = True, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + ): + CPCCA.__init__( + self, + n_modes=n_modes, + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + alpha=alpha, + compute=compute, + verbose=verbose, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + ) + self.attrs.update({"model": "Complex CPCCA"}) + + def _fit_algorithm(self, X: DataArray, Y: DataArray) -> Self: + if (not np.iscomplexobj(X)) or (not np.iscomplexobj(Y)): + warnings.warn( + "Expected complex-valued data but found real-valued data. For Hilbert model, use corresponding `Hilbert` class." + ) + + return super()._fit_algorithm(X, Y) + + def components_amplitude(self, normalized=True) -> Tuple[DataObject, DataObject]: + """Get the amplitude of the components. + + The amplitudes of the components are defined as + + .. math:: + A_{x, ij} = |p_{x, ij}| + .. math:: + A_{y, ij} = |p_{y, ij}| + + where :math:`p_{ij}` is the :math:`i`-th entry of the :math:`j`-th + component and :math:`|\\cdot|` denotes the absolute value. + + Returns + ------- + tuple[DataObject, DataObject] + Component amplitudes of :math:`X` and :math:`Y`. + + """ + Px, Py = self._get_components(normalized=normalized) + + Px = self.whitener1.inverse_transform_components(Px) + Py = self.whitener2.inverse_transform_components(Py) + + Px = abs(Px) + Py = abs(Py) + + Px.name = "components_amplitude_X" + Py.name = "components_amplitude_Y" + + Px = self.preprocessor1.inverse_transform_components(Px) + Py = self.preprocessor2.inverse_transform_components(Py) + + return Px, Py + + def components_phase(self, normalized=True) -> tuple[DataObject, DataObject]: + """Get the phase of the components. + + The phases of the components are defined as + + .. math:: + \\phi_{x, ij} = \\arg(p_{x, ij}) + .. math:: + \\phi_{y, ij} = \\arg(p_{y, ij}) + + where :math:`p_{ij}` is the :math:`i`-th entry of the :math:`j`-th component and + :math:`\\arg(\\cdot)` denotes the argument of a complex number. + + Returns + ------- + tuple[DataObject, DataObject] + Component phases of :math:`X` and :math:`Y`. + + """ + Px, Py = self._get_components(normalized=normalized) + + Px = self.whitener1.inverse_transform_components(Px) + Py = self.whitener2.inverse_transform_components(Py) + + Px = xr.apply_ufunc(np.angle, Px, keep_attrs=True) + Py = xr.apply_ufunc(np.angle, Py, keep_attrs=True) + + Px.name = "components_phase_X" + Py.name = "components_phase_Y" + + Px = self.preprocessor1.inverse_transform_components(Px) + Py = self.preprocessor2.inverse_transform_components(Py) + + return Px, Py + + def scores_amplitude(self, normalized=False) -> Tuple[DataArray, DataArray]: + """Get the amplitude of the scores. + + The amplitudes of the scores are defined as + + .. math:: + A_{x, ij} = |r_{y, ij}| + .. 
math:: + A_{y, ij} = |r_{x, ij}| + + where :math:`r_{ij}` is the :math:`i`-th entry of the :math:`j`-th score and + :math:`|\\cdot|` denotes the absolute value. + + Returns + ------- + tuple[DataArray, DataArray] + Score amplitudes of :math:`X` and :math:`Y`. + + """ + Rx, Ry = self._get_scores(normalized=normalized) + + Rx = self.whitener1.inverse_transform_scores(Rx) + Ry = self.whitener2.inverse_transform_scores(Ry) + + Rx = abs(Rx) + Ry = abs(Ry) + + Rx.name = "scores_amplitude_X" + Ry.name = "scores_amplitude_Y" + + Rx = self.preprocessor1.inverse_transform_scores(Rx) + Ry = self.preprocessor2.inverse_transform_scores(Ry) + + return Rx, Ry + + def scores_phase(self, normalized=False) -> Tuple[DataArray, DataArray]: + """Get the phase of the scores. + + The phases of the scores are defined as + + .. math:: + \\phi_{x, ij} = \\arg(r_{x, ij}) + .. math:: + \\phi_{y, ij} = \\arg(r_{y, ij}) + + where :math:`r_{ij}` is the :math:`i`-th entry of the :math:`j`-th score and + :math:`\\arg(\\cdot)` denotes the argument of a complex number. + + Returns + ------- + tuple[DataArray, DataArray] + Score phases of :math:`X` and :math:`Y`. + + """ + Rx, Ry = self._get_scores(normalized=normalized) + + Rx = self.whitener1.inverse_transform_scores(Rx) + Ry = self.whitener2.inverse_transform_scores(Ry) + + Rx = xr.apply_ufunc(np.angle, Rx, keep_attrs=True) + Ry = xr.apply_ufunc(np.angle, Ry, keep_attrs=True) + + Rx.name = "scores_phase_X" + Ry.name = "scores_phase_Y" + + Rx = self.preprocessor1.inverse_transform_scores(Rx) + Ry = self.preprocessor2.inverse_transform_scores(Ry) + + return Rx, Ry + + +class HilbertCPCCA(ComplexCPCCA): + """Hilbert CPCCA. + + Hilbert CPCCA extends classical CPCCA [1]_ by examining + amplitude-phase relationships. It augments the input data with its Hilbert + transform, creating a complex-valued field. + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^H X^H Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^H (X^HX)^{1-\\alpha_x} q_x = 1, \\quad q_y^H + (Y^HY)^{1-\\alpha_y} q_y = 1` + + where :math:`H` denotes the conjugate transpose, :math:`X` and :math:`Y` are + the augmented data matrices, and :math:`\\alpha_x` and :math:`\\alpha_y` + control the degree of whitening applied to the data. + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. + alpha : Sequence[float] | float, default=0.2 + Degree of whitening applied to the data. If float, the same value is + applied to both data sets. + padding : Sequence[str] | str | None, default="exp" + Padding method for the Hilbert transform. Available options are: - None: + no padding - "exp": exponential decay + decay_factor : Sequence[float] | float, default=0.2 + Decay factor for the exponential padding. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is computed in the reduced + principal component space. 
+ n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + Examples + -------- + + Perform Hilbert CPCCA on two real-valued datasets `X` and `Y`, using + exponential padding: + + >>> model = HilbertCPCCA(n_modes=5, padding="exp") + >>> model.fit(X, Y) + + References + ---------- + .. [1] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + + """ + + def __init__( + self, + n_modes: int = 2, + alpha: Sequence[float] | float = 0.2, + padding: Sequence[str] | str | None = "exp", + decay_factor: Sequence[float] | float = 0.2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + compute: bool = True, + verbose: bool = True, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + ): + ComplexCPCCA.__init__( + self, + n_modes=n_modes, + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + alpha=alpha, + compute=compute, + verbose=verbose, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + ) + self.attrs.update({"model": "Hilbert CPCCA"}) + + padding = self._process_parameter("padding", padding, "epx") + decay_factor = self._process_parameter("decay_factor", decay_factor, 0.2) + self._params["padding"] = padding + self._params["decay_factor"] = decay_factor + + def _fit_algorithm(self, X: DataArray, Y: DataArray) -> Self: + CPCCA._fit_algorithm(self, X, Y) + return self + + def transform( + self, X: DataObject | None = None, Y: DataObject | None = None, normalized=False + ) -> Sequence[DataArray]: + """Transform the input data into the component space.""" + raise NotImplementedError("Hilbert models do not support the transform method.") + + def _augment_data(self, X: DataArray, Y: DataArray) -> 
tuple[DataArray, DataArray]: + """Augment the data with the Hilbert transform.""" + params = self.get_params() + padding = params["padding"] + decay_factor = params["decay_factor"] + X = hilbert_transform( + X, + dims=(self.sample_name, self.feature_name[0]), + padding=padding[0], + decay_factor=decay_factor[0], + ) + Y = hilbert_transform( + Y, + dims=(self.sample_name, self.feature_name[1]), + padding=padding[1], + decay_factor=decay_factor[1], + ) + return X, Y diff --git a/xeofs/models/cpcca_rotator.py b/xeofs/models/cpcca_rotator.py new file mode 100644 index 0000000..9113b50 --- /dev/null +++ b/xeofs/models/cpcca_rotator.py @@ -0,0 +1,594 @@ +from typing import Dict, List, Sequence + +import numpy as np +import xarray as xr +from typing_extensions import Self + +from ..data_container import DataContainer +from ..preprocessing.preprocessor import Preprocessor +from ..preprocessing.whitener import Whitener +from ..utils.data_types import DataArray, DataObject +from ..utils.rotation import promax +from ..utils.xarray_utils import argsort_dask, get_deterministic_sign_multiplier +from ._base_model import _BaseModel +from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA + + +class CPCCARotator(CPCCA): + """Rotate a solution obtained from ``xe.models.CPCCA``. + + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_ and adapted to the CPCCA framework [2]_. + + Parameters + ---------- + n_modes : int, default=10 + Specify the number of modes to be rotated. + power : int, default=1 + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. + max_iter : int or None, default=None + Determine the maximum number of iterations for the computation of the + rotation matrix. If not specified, defaults to 1000 if ``compute=True`` + and 100 if ``compute=False``, since we can't terminate a lazy + computation based using ``rtol``. + rtol : float, default=1e-8 + Define the relative tolerance required to achieve convergence and + terminate the iterative process. + compute : bool, default=True + Whether to compute the rotation immediately. + + References + ---------- + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). + .. [2] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). 
+ + Examples + -------- + + Perform a CPCCA analysis: + + >>> model = CPCCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to first 5 components and scores: + + >>> rotator = CPCCARotator(n_modes=5) + >>> rotator.fit(model) + + Retrieve the rotated components and scores: + + >>> rotator.components() + >>> rotator.scores() + + """ + + def __init__( + self, + n_modes: int = 10, + power: int = 1, + max_iter: int | None = None, + rtol: float = 1e-8, + compute: bool = True, + ): + _BaseModel.__init__(self) + + if max_iter is None: + max_iter = 1000 if compute else 100 + + # Define model parameters + self._params = { + "n_modes": n_modes, + "power": power, + "max_iter": max_iter, + "rtol": rtol, + "compute": compute, + } + + # Define analysis-relevant meta data + self.attrs.update({"model": "Rotated CPCCA"}) + self.attrs.update(self._params) + + # Attach empty objects + self.preprocessor1 = Preprocessor() + self.preprocessor2 = Preprocessor() + self.whitener1 = Whitener() + self.whitener2 = Whitener() + self.data = DataContainer() + self.model = CPCCA() + + self.sorted = False + + def get_serialization_attrs(self) -> Dict: + return dict( + data=self.data, + preprocessor1=self.preprocessor1, + preprocessor2=self.preprocessor2, + whitener1=self.whitener1, + whitener2=self.whitener2, + model=self.model, + sorted=self.sorted, + ) + + def _fit_algorithm(self, model) -> Self: + self.model = model + self.preprocessor1 = model.preprocessor1 + self.preprocessor2 = model.preprocessor2 + self.whitener1 = model.whitener1 + self.whitener2 = model.whitener2 + self.sample_name = self.model.sample_name + self.feature_name = self.model.feature_name + self.sorted = False + + common_feature_dim = "common_feature_dim" + feature_name = self._get_feature_name() + + n_modes = self._params["n_modes"] + power = self._params["power"] + max_iter = self._params["max_iter"] + rtol = self._params["rtol"] + + # Construct the combined vector of loadings + # NOTE: In the methodology + # used by Cheng & Dunkerton (CD95), the combined vectors are "loaded" or + # weighted with the square root of the singular values, akin to what is + # done in standard Varimax rotation. While this approach ensures that + # the resulting projections are still uncorrelated as in the unrotated + # solution, it does not conserve the squared covariance fraction, i.e. + # the amount of explained squared covariance can be different before and + # after rotation. The authors then introduced a so-called "covariance + # fraction" which is conserved under rotation, but does not have a clear + # interpretation as the term covariance fraction is only correct when + # both data sets X and Y are equal and MCA reduces to PCA. 
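+        # In short: the rotation below acts on combined loadings of the form
+        # L = [Qx; Qy] * sqrt(s), i.e. the singular vectors of both fields,
+        # stacked along a common feature dimension and weighted by the square
+        # root of the singular values, are passed to the Promax/Varimax
+        # rotation.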
+ svalues = self.model.data["singular_values"].sel(mode=slice(1, n_modes)) + scaling = np.sqrt(svalues) + + # Get unrotated singular vectors + Qx = self.model.data["components1"].sel(mode=slice(1, n_modes)) + Qy = self.model.data["components2"].sel(mode=slice(1, n_modes)) + + # Unwhiten and back-transform into physical space + Qx = self.whitener1.inverse_transform_components(Qx) + Qy = self.whitener2.inverse_transform_components(Qy) + + # Rename the feature dimension to a common name so that the combined vectors can be concatenated + Qx = Qx.rename({feature_name[0]: common_feature_dim}) + Qy = Qy.rename({feature_name[1]: common_feature_dim}) + + loadings = xr.concat([Qx, Qy], dim=common_feature_dim) * scaling + + # Rotate loadings + promax_kwargs = {"power": power, "max_iter": max_iter, "rtol": rtol} + rot_loadings, rot_matrix, phi_matrix = promax( + loadings=loadings, + feature_dim=common_feature_dim, + compute=self._params["compute"], + **promax_kwargs, + ) + + # Assign coordinates to the rotation/correlation matrices + rot_matrix = rot_matrix.assign_coords( + mode_m=np.arange(1, rot_matrix.mode_m.size + 1), + mode_n=np.arange(1, rot_matrix.mode_n.size + 1), + ) + phi_matrix = phi_matrix.assign_coords( + mode_m=np.arange(1, phi_matrix.mode_m.size + 1), + mode_n=np.arange(1, phi_matrix.mode_n.size + 1), + ) + + # Rotated (loaded) singular vectors + Qx_rot = rot_loadings.isel( + {common_feature_dim: slice(0, Qx.coords[common_feature_dim].size)} + ) + Qy_rot = rot_loadings.isel( + {common_feature_dim: slice(Qx.coords[common_feature_dim].size, None)} + ) + + # Rename the common feature dimension to the original feature names + Qx_rot = Qx_rot.rename({common_feature_dim: feature_name[0]}) + Qy_rot = Qy_rot.rename({common_feature_dim: feature_name[1]}) + + # For consistency with the unrotated model classes, we transform the pattern vectors + # into the whitened PC space + Qx_rot = self.whitener1.transform_components(Qx_rot) + Qy_rot = self.whitener2.transform_components(Qy_rot) + + # Normalization factor of singular vectors + norm1_rot = xr.apply_ufunc( + np.linalg.norm, + Qx_rot, + input_core_dims=[[feature_name[0], "mode"]], + output_core_dims=[["mode"]], + exclude_dims={feature_name[0]}, + kwargs={"axis": 0}, + vectorize=False, + dask="allowed", + ) + norm2_rot = xr.apply_ufunc( + np.linalg.norm, + Qy_rot, + input_core_dims=[[feature_name[1], "mode"]], + output_core_dims=[["mode"]], + exclude_dims={feature_name[1]}, + kwargs={"axis": 0}, + vectorize=False, + dask="allowed", + ) + + # Rotated (normalized) singular vectors + Qx_rot = Qx_rot / norm1_rot + Qy_rot = Qy_rot / norm2_rot + + # CD95 call the quantity "norm1 * norm2" the "explained covariance" + explained_covariance = norm1_rot * norm2_rot + squared_covariance = explained_covariance**2 + + # Reorder according to squared covariance + idx_modes_sorted = argsort_dask(squared_covariance, "mode")[::-1] # type: ignore + idx_modes_sorted.coords.update(squared_covariance.coords) + + # Rotate scores using rotation matrix + scores1 = self.model.data["scores1"].sel(mode=slice(1, n_modes)) + scores2 = self.model.data["scores2"].sel(mode=slice(1, n_modes)) + + scores1 = self.whitener1.inverse_transform_scores(scores1) + scores2 = self.whitener2.inverse_transform_scores(scores2) + + # Normalize scores + scores1 = scores1 / scaling + scores2 = scores2 / scaling + + RinvT = self._compute_rot_mat_inv_trans( + rot_matrix, input_dims=("mode_m", "mode_n") + ) + # Rename dimension mode to ensure that dot product has dimensions (sample x mode) as output + 
scores1 = scores1.rename({"mode": "mode_m"}) + scores2 = scores2.rename({"mode": "mode_m"}) + RinvT = RinvT.rename({"mode_n": "mode"}) + scores1_rot = xr.dot(scores1, RinvT, dims="mode_m") * norm1_rot + scores2_rot = xr.dot(scores2, RinvT, dims="mode_m") * norm2_rot + + # Ensure consistent signs for deterministic output + modes_sign = get_deterministic_sign_multiplier(rot_loadings, common_feature_dim) + Qx_rot = Qx_rot * modes_sign + Qy_rot = Qy_rot * modes_sign + scores1_rot = scores1_rot * modes_sign + scores2_rot = scores2_rot * modes_sign + + # Create data container + self.data.add( + name="input_data1", data=self.model.data["input_data1"], allow_compute=False + ) + self.data.add( + name="input_data2", data=self.model.data["input_data2"], allow_compute=False + ) + self.data.add(name="components1", data=Qx_rot) + self.data.add(name="components2", data=Qy_rot) + self.data.add(name="scores1", data=scores1_rot) + self.data.add(name="scores2", data=scores2_rot) + self.data.add(name="squared_covariance", data=squared_covariance) + self.data.add( + name="total_squared_covariance", + data=self.model.data["total_squared_covariance"], + ) + + self.data.add(name="idx_modes_sorted", data=idx_modes_sorted) + self.data.add(name="norm1", data=norm1_rot) + self.data.add(name="norm2", data=norm2_rot) + self.data.add(name="rotation_matrix", data=rot_matrix) + self.data.add(name="phi_matrix", data=phi_matrix) + self.data.add(name="modes_sign", data=modes_sign) + + # Assign analysis-relevant meta data + self.data.set_attrs(self.attrs) + + return self + + def fit(self, model: CPCCA) -> Self: + """Rotate the solution obtained from ``xe.models.CPCCA``. + + Parameters + ---------- + model : ``xe.models.CPCCA`` + The CPCCA model to be rotated. + + """ + self._fit_algorithm(model) + + if self._params["compute"]: + self.compute() + + return self + + def transform( + self, + X: DataObject | None = None, + Y: DataObject | None = None, + normalized: bool = False, + ) -> DataArray | List[DataArray]: + """Project new "unseen" data onto the rotated singular vectors. + + Parameters + ---------- + X : DataObject + Data to be projected onto the rotated singular vectors of the first dataset. + Y : DataObject + Data to be projected onto the rotated singular vectors of the second dataset. + + Returns + ------- + DataArray | List[DataArray] + Projected data. + + """ + # raise error if no data is provided + if X is None and Y is None: + raise ValueError("No data provided. 
Please provide X and/or Y.") + + n_modes = self._params["n_modes"] + rot_matrix = self.data["rotation_matrix"] + RinvT = self._compute_rot_mat_inv_trans( + rot_matrix, input_dims=("mode_m", "mode_n") + ) + RinvT = RinvT.rename({"mode_n": "mode"}) + + scaling = self.model.data["singular_values"].sel(mode=slice(1, n_modes)) + scaling = np.sqrt(scaling) + + results = [] + + if X is not None: + # Select the (non-rotated) singular vectors of the first dataset + comps1 = self.model.data["components1"].sel(mode=slice(1, n_modes)) + + # Preprocess the data + comps1 = self.whitener1.inverse_transform_components(comps1) + X = self.preprocessor1.transform(X) + + # Compute non-rotated scores by projecting the data onto non-rotated components + projections1 = xr.dot(X, comps1) / scaling + # Rotate the scores + projections1 = projections1.rename({"mode": "mode_m"}) + projections1 = xr.dot(projections1, RinvT, dims="mode_m") + # Reorder according to variance + if self.sorted: + projections1 = projections1.isel( + mode=self.data["idx_modes_sorted"].values + ).assign_coords(mode=projections1.mode) + # Adapt the sign of the scores + projections1 = projections1 * self.data["modes_sign"] + + # Unscale the scores + if not normalized: + projections1 = projections1 * self.data["norm1"] + + # Unstack the projections + projections1 = self.preprocessor1.inverse_transform_scores(projections1) + + results.append(projections1) + + if Y is not None: + # Select the (non-rotated) singular vectors of the second dataset + comps2 = self.model.data["components2"].sel(mode=slice(1, n_modes)) + + # Preprocess the data + comps2 = self.whitener2.inverse_transform_components(comps2) + Y = self.preprocessor2.transform(Y) + + # Compute non-rotated scores by projecting the data onto non-rotated components + projections2 = xr.dot(Y, comps2) / scaling + # Rotate the scores + projections2 = projections2.rename({"mode": "mode_m"}) + projections2 = xr.dot(projections2, RinvT, dims="mode_m") + # Reorder according to variance + if self.sorted: + projections2 = projections2.isel( + mode=self.data["idx_modes_sorted"].values + ).assign_coords(mode=projections2.mode) + # Adapt the sign of the scores + projections2 = projections2 * self.data["modes_sign"] + + # Unscale the scores + if not normalized: + projections2 = projections2 * self.data["norm2"] + + # Unstack the projections + projections2 = self.preprocessor2.inverse_transform_scores(projections2) + + results.append(projections2) + + if len(results) == 0: + raise ValueError("provide at least one of [`X`, `Y`]") + elif len(results) == 1: + return results[0] + else: + return results + + def _post_compute(self): + """Leave sorting until after compute because it can't be done lazily.""" + self._sort_by_variance() + + def _sort_by_variance(self): + """Re-sort the mode dimension of all data variables by variance explained.""" + if not self.sorted: + for key in self.data.keys(): + if "mode" in self.data[key].dims and key != "idx_modes_sorted": + self.data[key] = ( + self.data[key] + .isel(mode=self.data["idx_modes_sorted"].values) + .assign_coords(mode=self.data[key].mode) + ) + self.sorted = True + + def _compute_rot_mat_inv_trans(self, rotation_matrix, input_dims) -> xr.DataArray: + """Compute the inverse transpose of the rotation matrix. + + For orthogonal rotations (e.g., Varimax), the inverse transpose is equivalent + to the rotation matrix itself. For oblique rotations (e.g., Promax), the simplification + does not hold. 
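The orthogonal shortcut mentioned above can be sanity-checked with a tiny, purely illustrative numpy sketch; the random QR-based matrix below is only a stand-in for an actual Varimax rotation matrix and is not part of the xeofs API:

>>> import numpy as np
>>> R, _ = np.linalg.qr(np.random.default_rng(0).standard_normal((5, 5)))  # orthogonal stand-in for a Varimax rotation matrix
>>> np.allclose(np.linalg.inv(R).T, R)  # inverse transpose equals the matrix itself
True

For an oblique Promax rotation (``power > 1``) this identity generally fails, which is why the inverse transpose has to be computed explicitly in that case.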
+ + Returns + ------- + rotation_matrix : xr.DataArray + + """ + if self._params["power"] > 1: + # inverse matrix + rotation_matrix = xr.apply_ufunc( + np.linalg.inv, + rotation_matrix, + input_core_dims=[(input_dims)], + output_core_dims=[(input_dims[::-1])], + vectorize=False, + dask="allowed", + ) + # transpose matrix + rotation_matrix = rotation_matrix.conj().transpose(*input_dims) + return rotation_matrix + + def _get_feature_name(self): + return self.model.feature_name + + +class ComplexCPCCARotator(CPCCARotator, ComplexCPCCA): + """Rotate a solution obtained from ``xe.models.ComplexCPCCA``. + + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_ and adapted to the CPCCA framework [2]_. + + + Parameters + ---------- + n_modes : int, default=10 + Specify the number of modes to be rotated. + power : int, default=1 + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. + max_iter : int, default=1000 + Determine the maximum number of iterations for the computation of the + rotation matrix. + rtol : float, default=1e-8 + Define the relative tolerance required to achieve convergence and + terminate the iterative process. + squared_loadings : bool, default=False + Specify the method for constructing the combined vectors of loadings. If + True, the combined vectors are loaded with the singular values (termed + "squared loadings"), conserving the squared covariance under rotation. + This allows estimation of mode importance after rotation. If False, the + combined vectors are loaded with the square root of the singular values, + following the method described by Cheng & Dunkerton. + compute : bool, default=True + Whether to compute the rotation immediately. + + References + ---------- + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). + .. [2] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + + Examples + -------- + + Perform a CPCCA analysis: + + >>> model = ComplexCPCCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to the first 5 components and scores: + + >>> rotator = ComplexCPCCARotator(n_modes=5) + >>> rotator.fit(model) + + Retrieve the rotated components and scores: + + >>> rotator.components() + >>> rotator.scores() + + """ + + def __init__(self, **kwargs): + CPCCARotator.__init__(self, **kwargs) + self.attrs.update({"model": "Rotated Complex CPCCA"}) + self.model = ComplexCPCCA() + + +class HilbertCPCCARotator(ComplexCPCCARotator, HilbertCPCCA): + """Rotate a solution obtained from ``xe.models.HilbertCPCCA``. + + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_ and adapted to the CPCCA framework [2]_. + + + Parameters + ---------- + n_modes : int, default=10 + Specify the number of modes to be rotated. + power : int, default=1 + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. + max_iter : int, default=1000 + Determine the maximum number of iterations for the computation of the + rotation matrix. + rtol : float, default=1e-8 + Define the relative tolerance required to achieve convergence and + terminate the iterative process. 
+ squared_loadings : bool, default=False + Specify the method for constructing the combined vectors of loadings. If + True, the combined vectors are loaded with the singular values (termed + "squared loadings"), conserving the squared covariance under rotation. + This allows estimation of mode importance after rotation. If False, the + combined vectors are loaded with the square root of the singular values, + following the method described by Cheng & Dunkerton. + compute : bool, default=True + Whether to compute the rotation immediately. + + References + ---------- + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). + .. [2] Swenson, E. Continuum Power CCA: A Unified Approach for Isolating + Coupled Modes. Journal of Climate 28, 1016–1030 (2015). + + Examples + -------- + + Perform a CPCCA analysis: + + >>> model = HilbertCPCCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to the first 5 components and scores: + + >>> rotator = HilbertCPCCARotator(n_modes=5) + >>> rotator.fit(model) + + Retrieve the rotated components and scores: + + >>> rotator.components() + >>> rotator.scores() + + """ + + def __init__(self, **kwargs): + ComplexCPCCARotator.__init__(self, **kwargs) + self.attrs.update({"model": "Rotated Hilbert CPCCA"}) + self.model = HilbertCPCCA() + + def transform( + self, X: DataObject | None = None, Y: DataObject | None = None, normalized=False + ) -> Sequence[DataArray]: + """Transform the data.""" + # Here we make use of the Method Resolution Order (MRO) to call the + # transform method of the first class in the MRO after `CPCCARotator` + # that has a transform method. In this case it will be `HilbertCPCCA`, + # whose transform method raises an error because Hilbert models do not + # support projecting unseen data. + return super(CPCCARotator, self).transform(X, Y, normalized) diff --git a/xeofs/models/decomposer.py b/xeofs/models/decomposer.py index 27ecda6..0b91f9c 100644 --- a/xeofs/models/decomposer.py +++ b/xeofs/models/decomposer.py @@ -1,5 +1,4 @@ import warnings -from typing import Optional import dask import numpy as np @@ -37,7 +36,7 @@ class Decomposer: than 80% of the smallest dimension of the data, then the more efficient `randomized` method is enabled. Otherwise the exact full SVD is computed and optionally truncated afterwards. - random_state : Optional[int], default=None + random_state : np.random.Generator | int | None, default=None Seed for the random number generator. verbose: bool, default=False Whether to show a progress bar when computing the decomposition. @@ -54,7 +53,7 @@ def __init__( flip_signs: bool = True, compute: bool = True, solver: str = "auto", - random_state: Optional[int] = None, + random_state: np.random.Generator | int | None = None, verbose: bool = False, component_dim_name: str = "mode", solver_kwargs: dict = {}, diff --git a/xeofs/models/eof.py b/xeofs/models/eof.py index b1ec5c4..275d9c8 100644 --- a/xeofs/models/eof.py +++ b/xeofs/models/eof.py @@ -8,11 +8,11 @@ from ..utils.data_types import DataArray, DataObject from ..utils.hilbert_transform import hilbert_transform from ..utils.xarray_utils import total_variance as compute_total_variance -from ._base_model import _BaseModel +from ._base_model_single_set import _BaseModelSingleSet from .decomposer import Decomposer -class EOF(_BaseModel): +class EOF(_BaseModelSingleSet): """EOF analysis. 
Empirical Orthogonal Functions (EOF) analysis, more commonly known diff --git a/xeofs/models/gwpca.py b/xeofs/models/gwpca.py index f9813ce..ae99ccd 100644 --- a/xeofs/models/gwpca.py +++ b/xeofs/models/gwpca.py @@ -1,23 +1,22 @@ +import numpy as np +import xarray as xr from typing_extensions import Self - from xeofs.utils.data_types import DataArray -from ._base_model import _BaseModel -from ..utils.sanity_checks import assert_not_complex + from ..utils.constants import ( VALID_CARTESIAN_X_NAMES, VALID_CARTESIAN_Y_NAMES, VALID_LATITUDE_NAMES, VALID_LONGITUDE_NAMES, ) -import numpy as np -import xarray as xr - from ..utils.distance_metrics import VALID_METRICS from ..utils.kernels import VALID_KERNELS +from ..utils.sanity_checks import assert_not_complex +from ._base_model_single_set import _BaseModelSingleSet -class GWPCA(_BaseModel): +class GWPCA(_BaseModelSingleSet): """Geographically weighted PCA. Geographically weighted PCA (GWPCA) [1]_ uses a geographically weighted approach to perform PCA for diff --git a/xeofs/models/mca.py b/xeofs/models/mca.py index 046a132..221361e 100644 --- a/xeofs/models/mca.py +++ b/xeofs/models/mca.py @@ -1,361 +1,133 @@ import warnings -from typing import Dict, Optional, Sequence, Tuple +from typing import Sequence import numpy as np -import xarray as xr -from typing_extensions import Self -from ..utils.data_types import DataArray, DataObject -from ..utils.dimension_renamer import DimensionRenamer -from ..utils.hilbert_transform import hilbert_transform -from ..utils.sanity_checks import assert_not_complex -from ..utils.statistics import pearson_correlation -from ..utils.xarray_utils import argsort_dask -from ._base_cross_model import _BaseCrossModel -from .decomposer import Decomposer +from ..utils.data_types import DataArray +from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA -class MCA(_BaseCrossModel): - """Maximum Covariance Analyis. +class MCA(CPCCA): + """Maximum Covariance Analysis (MCA). - MCA is a statistical method that finds patterns of maximum covariance between two datasets. + MCA seeks to find pairs of coupled patterns that maximize the squared + covariance [1]_ [2]_. + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^T X^T Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^T q_x = 1, \\quad q_y^T q_y = 1` + + where :math:`X` and :math:`Y` are the input data matrices and :math:`q_x` + and :math:`q_y` are the corresponding pattern vectors. Parameters ---------- - n_modes: int, default=2 + n_modes : int, default=2 Number of modes to calculate. - center: bool, default=True - Whether to center the input data. - standardize: bool, default=False - Whether to standardize the input data. - use_coslat: bool, default=False - Whether to use cosine of latitude for scaling. - n_pca_modes: int, default=None - The number of principal components to retain during the PCA preprocessing - step applied to both data sets prior to executing MCA. - If set to None, PCA preprocessing will be bypassed, and the MCA will be performed on the original datasets. - Specifying an integer value greater than 0 for `n_pca_modes` will trigger the PCA preprocessing, retaining - only the specified number of principal components. This reduction in dimensionality can be especially beneficial - when dealing with high-dimensional data, where computing the cross-covariance matrix can become computationally - intensive or in scenarios where multicollinearity is a concern. 
+ standardize : Sequence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=True + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is then computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during the PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. compute : bool, default=True - Whether to compute elements of the model eagerly, or to defer computation. - If True, four pieces of the fit will be computed sequentially: 1) the - preprocessor scaler, 2) optional NaN checks, 3) SVD decomposition, 4) scores - and components. - sample_name: str, default="sample" - Name of the new sample dimension. - feature_name: str, default="feature" - Name of the new feature dimension. - solver: {"auto", "full", "randomized"}, default="auto" - Solver to use for the SVD computation. - random_state: int, default=None + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None Seed for the random number generator. - solver_kwargs: dict, default={} + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} Additional keyword arguments passed to the SVD solver function. - Notes - ----- - MCA is similar to Principal Component Analysis (PCA) and Canonical Correlation Analysis (CCA), - but while PCA finds modes of maximum variance and CCA finds modes of maximum correlation, - MCA finds modes of maximum covariance. See [1]_ [2]_ for more details. References ---------- - .. [1] Bretherton, C., Smith, C., Wallace, J., 1992. An intercomparison of methods for finding coupled patterns in climate data. Journal of climate 5, 541–560. - .. [2] Cherry, S., 1996. Singular value decomposition analysis and canonical correlation analysis. Journal of Climate 9, 2003–2009. + .. [1] Bretherton, C., Smith, C., Wallace, J., 1992. An intercomparison of + methods for finding coupled patterns in climate data. Journal of climate + 5, 541–560. + .. [2] Wilks, D. S. Statistical Methods in the Atmospheric Sciences. + (Academic Press, 2019). + doi:https://doi.org/10.1016/B978-0-12-815823-4.00011-0. 
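Up to the preprocessing and whitening steps, the patterns that solve this optimization problem are the singular vectors of the cross-covariance matrix between the two fields. A minimal, purely illustrative numpy sketch of that core computation follows; the synthetic matrices ``X`` and ``Y`` and all variable names are made up for the example and this is not the xeofs implementation:

>>> import numpy as np
>>> rng = np.random.default_rng(42)
>>> X = rng.standard_normal((100, 4)); X -= X.mean(axis=0)  # (sample x feature_x), centered
>>> Y = rng.standard_normal((100, 3)); Y -= Y.mean(axis=0)  # (sample x feature_y), centered
>>> C = X.T @ Y / (X.shape[0] - 1)                          # cross-covariance matrix
>>> qx, sigma, qyT = np.linalg.svd(C, full_matrices=False)  # pattern vectors and singular values
>>> scf = sigma**2 / (sigma**2).sum()                       # squared covariance fraction per mode
>>> ax, ay = X @ qx, Y @ qyT.T                              # (unnormalized) expansion coefficients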
Examples -------- - >>> model = MCA(n_modes=5, standardize=True) - >>> model.fit(data1, data2) + + Perform MCA on two datasets on a regular longitude-latitude grid: + + >>> model = MCA(n_modes=5, use_coslat=True) + >>> model.fit(X, Y, dim="time") """ def __init__( self, n_modes: int = 2, - center: bool = True, - standardize: bool = False, - use_coslat: bool = False, - check_nans: bool = True, - n_pca_modes: Optional[int] = None, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, compute: bool = True, + verbose: bool = False, sample_name: str = "sample", - feature_name: str = "feature", + feature_name: Sequence[str] | str = "feature", solver: str = "auto", - random_state: Optional[int] = None, - solver_kwargs: Dict = {}, - **kwargs, + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, ): - super().__init__( + CPCCA.__init__( + self, n_modes=n_modes, - center=center, + alpha=[1.0, 1.0], standardize=standardize, use_coslat=use_coslat, check_nans=check_nans, + use_pca=use_pca, n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, compute=compute, + verbose=verbose, sample_name=sample_name, feature_name=feature_name, solver=solver, random_state=random_state, solver_kwargs=solver_kwargs, - **kwargs, - ) - self.attrs.update({"model": "MCA"}) - - def _compute_cross_covariance_matrix(self, X1, X2): - """Compute the cross-covariance matrix of two data objects. - - Note: It is assumed that the data objects are centered. - - """ - sample_name = self.sample_name - n_samples = X1.coords[sample_name].size - if X1.coords[sample_name].size != X2.coords[sample_name].size: - err_msg = "The two data objects must have the same number of samples." 
- raise ValueError(err_msg) - - return xr.dot(X1.conj(), X2, dims=sample_name) / (n_samples - 1) - - def _augment_data(self, data, dims): - return data - - def _fit_algorithm( - self, - data1: DataArray, - data2: DataArray, - ) -> Self: - sample_name = self.sample_name - feature_name = self.feature_name - - # Initialize the SVD decomposer - decomposer = Decomposer(**self._decomposer_kwargs) - - # Perform SVD on PCA-reduced data - if (self.pca1 is not None) and (self.pca2 is not None): - # Fit the PCA models - self.pca1.fit(data1, dim=sample_name) - self.pca2.fit(data2, dim=sample_name) - # Get the PCA scores - pca_scores1 = self.pca1.data["scores"] * self.pca1.singular_values() - pca_scores2 = self.pca2.data["scores"] * self.pca2.singular_values() - - # Augment data - pca_scores1 = self._augment_data(pca_scores1, dims=(sample_name, "mode")) - pca_scores2 = self._augment_data(pca_scores2, dims=(sample_name, "mode")) - - # Compute the cross-covariance matrix of the PCA scores - pca_scores1 = pca_scores1.rename({"mode": "feature1"}) - pca_scores2 = pca_scores2.rename({"mode": "feature2"}) - cov_matrix = self._compute_cross_covariance_matrix(pca_scores1, pca_scores2) - - # Perform the SVD - decomposer.fit(cov_matrix, dims=("feature1", "feature2")) - V1 = decomposer.U_ # left singular vectors (feature1 x mode) - V2 = decomposer.V_ # right singular vectors (feature2 x mode) - - # left and right PCA eigenvectors (feature x mode) - V1pre = self.pca1.data["components"] - V2pre = self.pca2.data["components"] - - # Compute the singular vectors - V1pre = V1pre.rename({"mode": "feature1"}) - V2pre = V2pre.rename({"mode": "feature2"}) - singular_vectors1 = xr.dot(V1pre, V1, dims="feature1") - singular_vectors2 = xr.dot(V2pre, V2, dims="feature2") - - # Perform SVD directly on data - else: - # Augment data - data1 = self._augment_data(data1, dims=(sample_name, feature_name)) - data2 = self._augment_data(data2, dims=(sample_name, feature_name)) - - # Rename feature and associated dimensions of data objects to avoid index conflicts - dim_renamer1 = DimensionRenamer(feature_name, "1") - dim_renamer2 = DimensionRenamer(feature_name, "2") - data1_temp = dim_renamer1.fit_transform(data1) - data2_temp = dim_renamer2.fit_transform(data2) - # Compute the cross-covariance matrix - cov_matrix = self._compute_cross_covariance_matrix(data1_temp, data2_temp) - - # Perform the SVD - decomposer.fit(cov_matrix, dims=("feature1", "feature2")) - singular_vectors1 = decomposer.U_ - singular_vectors2 = decomposer.V_ - - # Rename the singular vectors - singular_vectors1 = dim_renamer1.inverse_transform(singular_vectors1) - singular_vectors2 = dim_renamer2.inverse_transform(singular_vectors2) - - # Store the results - singular_values = decomposer.s_ - - # Compute total squared variance - squared_covariance = singular_values**2 - total_squared_covariance = (abs(cov_matrix) ** 2).sum() - - norm1 = np.sqrt(singular_values) - norm2 = np.sqrt(singular_values) - - # Index of the sorted squared covariance - idx_sorted_modes = argsort_dask(squared_covariance, "mode")[::-1] - idx_sorted_modes.coords.update(squared_covariance.coords) - - # Project the data onto the singular vectors - scores1 = xr.dot(data1, singular_vectors1, dims=feature_name) / norm1 - scores2 = xr.dot(data2, singular_vectors2, dims=feature_name) / norm2 - - self.data.add(name="input_data1", data=data1, allow_compute=False) - self.data.add(name="input_data2", data=data2, allow_compute=False) - self.data.add(name="components1", data=singular_vectors1) - 
self.data.add(name="components2", data=singular_vectors2) - self.data.add(name="scores1", data=scores1) - self.data.add(name="scores2", data=scores2) - self.data.add(name="squared_covariance", data=squared_covariance) - self.data.add(name="total_squared_covariance", data=total_squared_covariance) - self.data.add(name="idx_modes_sorted", data=idx_sorted_modes) - self.data.add(name="norm1", data=norm1) - self.data.add(name="norm2", data=norm2) - - # Assign analysis-relevant meta data - self.data.set_attrs(self.attrs) - return self - - def transform( - self, data1: Optional[DataObject] = None, data2: Optional[DataObject] = None - ) -> Sequence[DataArray]: - """Get the expansion coefficients of "unseen" data. - - The expansion coefficients are obtained by projecting data onto the singular vectors. - - Parameters - ---------- - data1: DataArray | Dataset | List[DataArray] - Left input data. Must be provided if `data2` is not provided. - data2: DataArray | Dataset | List[DataArray] - Right input data. Must be provided if `data1` is not provided. - - Returns - ------- - scores1: DataArray | Dataset | List[DataArray] - Left scores. - scores2: DataArray | Dataset | List[DataArray] - Right scores. - - """ - return super().transform(data1, data2) - - def _transform_algorithm( - self, data1: Optional[DataArray] = None, data2: Optional[DataArray] = None - ) -> Dict[str, DataArray]: - results = {} - if data1 is not None: - # Project data onto singular vectors - comps1 = self.data["components1"] - norm1 = self.data["norm1"] - scores1 = xr.dot(data1, comps1) / norm1 - results["data1"] = scores1 - - if data2 is not None: - # Project data onto singular vectors - comps2 = self.data["components2"] - norm2 = self.data["norm2"] - scores2 = xr.dot(data2, comps2) / norm2 - results["data2"] = scores2 - - return results - - def _inverse_transform_algorithm( - self, scores1: DataArray, scores2: DataArray - ) -> Tuple[DataArray, DataArray]: - """Reconstruct the original data from transformed data. - - Parameters - ---------- - scores1: DataArray - Transformed left field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - scores2: DataArray - Transformed right field data to be reconstructed. This could be - a subset of the `scores` data of a fitted model, or unseen data. - Must have a 'mode' dimension. - - Returns - ------- - Xrec1: DataArray - Reconstructed data of left field. - Xrec2: DataArray - Reconstructed data of right field. - - """ - # Singular vectors - comps1 = self.data["components1"].sel(mode=scores1.mode) - comps2 = self.data["components2"].sel(mode=scores2.mode) - - # Norms - norm1 = self.data["norm1"].sel(mode=scores1.mode) - norm2 = self.data["norm2"].sel(mode=scores2.mode) - - # Reconstruct the data - data1 = xr.dot(scores1, comps1.conj() * norm1, dims="mode") - data2 = xr.dot(scores2, comps2.conj() * norm2, dims="mode") - - return data1, data2 - - def squared_covariance(self): - """Get the squared covariance. - - The squared covariance corresponds to the explained variance in PCA and is given by the - squared singular values of the covariance matrix. - - """ - return self.data["squared_covariance"] - - def squared_covariance_fraction(self): - """Calculate the squared covariance fraction (SCF). - - The SCF is a measure of the proportion of the total squared covariance that is explained by each mode `i`. It is computed - as follows: - - .. 
math:: - SCF_i = \\frac{\\sigma_i^2}{\\sum_{i=1}^{m} \\sigma_i^2} - - where `m` is the total number of modes and :math:`\\sigma_i` is the `ith` singular value of the covariance matrix. - - """ - return self.data["squared_covariance"] / self.data["total_squared_covariance"] - - def singular_values(self): - """Get the singular values of the cross-covariance matrix.""" - singular_values = xr.apply_ufunc( - np.sqrt, - self.data["squared_covariance"], - dask="allowed", - vectorize=False, - keep_attrs=True, ) - singular_values.name = "singular_values" - return singular_values - - def total_covariance(self) -> DataArray: - """Get the total covariance. - - This measure follows the defintion of Cheng and Dunkerton (1995). - Note that this measure is not an invariant in MCA. - - """ - tot_cov = self.singular_values().sum() - tot_cov.attrs.update(self.singular_values().attrs) - tot_cov.name = "total_covariance" - return tot_cov + self.attrs.update({"model": "Maximum Covariance Analysis"}) + # Remove alpha from the inherited CPCCA serialization params because it is hard-coded for MCA + self._params.pop("alpha") - def covariance_fraction(self): + def covariance_fraction_CD95(self): """Get the covariance fraction (CF). - Cheng and Dunkerton (1995) define the CF as follows: + Cheng and Dunkerton (1995) [3]_ define the CF as follows: .. math:: CF_i = \\frac{\\sigma_i}{\\sum_{i=1}^{m} \\sigma_i} @@ -363,211 +135,96 @@ def covariance_fraction(self): where `m` is the total number of modes and :math:`\\sigma_i` is the `ith` singular value of the covariance matrix. - In this implementation the sum of singular values is estimated from - the first `n` modes, therefore one should aim to retain as many - modes as possible to get a good estimate of the covariance fraction. + This implementation estimates the sum of singular values from the first + `n` modes, therefore one should aim to retain as many modes as possible + to get a good estimate of the covariance fraction. Note ---- - It is important to differentiate the CF from the squared covariance fraction (SCF). While the SCF is an invariant quantity in MCA, the CF is not. - Therefore, the SCF is used to assess the relative importance of each mode. Cheng and Dunkerton (1995) introduced the CF in the context of - Varimax-rotated MCA to compare the relative importance of each mode before and after rotation. In the special case of both data fields in MCA being identical, - the CF is equivalent to the explained variance ratio in EOF analysis. + In MCA, the focus is on maximizing the *squared* covariance (SC). As a + result, this quantity is preserved during decomposition - meaning the SC + of both datasets remains unchanged before and after decomposition. Each + mode explains a fraction of the total SC, and together, all modes can + reconstruct the total SC of the cross-covariance matrix. However, the + (non-squared) covariance is not invariant in MCA; it is not preserved by + the individual modes and cannot be reconstructed from them. + Consequently, the squared covariance fraction (SCF) is invariant in MCA + and is typically used to assess the relative importance of each mode. In + contrast, the covariance fraction (CF) is not invariant. Cheng and + Dunkerton [3]_ introduced the CF to compare the relative importance of + modes before and after Varimax rotation in MCA. Notably, when the data + fields in MCA are identical, the CF corresponds to the explained + variance ratio in Principal Component Analysis (PCA). + + References + ---------- + .. [3] Cheng, X. 
& Dunkerton, T. J. Orthogonal Rotation of Spatial + Patterns Derived from Singular Value Decomposition Analysis. J. + Climate 8, 2631–2643 (1995). """ # Check how sensitive the CF is to the number of modes - svals = self.singular_values() - tot_var = self.total_covariance() - cf = svals[0] / svals.cumsum() + cov_exp = self._covariance_explained_DC95() + tot_var = self._total_covariance() + cf = cov_exp[0] / cov_exp.cumsum() change_per_mode = cf.shift({"mode": 1}) - cf change_in_cf_in_last_mode = change_per_mode.isel(mode=-1) if change_in_cf_in_last_mode > 0.001: - print( - "Warning: CF is sensitive to the number of modes retained. Please increase `n_modes` for a better estimate." + warnings.warn( + "The current estimate of CF is sensitive to the number of modes retained. Please increase `n_modes` for a better estimate." ) - cov_frac = svals / tot_var + cov_frac = cov_exp / tot_var cov_frac.name = "covariance_fraction" - cov_frac.attrs.update(svals.attrs) + cov_frac.attrs.update(cov_exp.attrs) return cov_frac - def components(self): - """Return the singular vectors of the left and right field. - - Returns - ------- - components1: DataArray | Dataset | List[DataArray] - Left components of the fitted model. - components2: DataArray | Dataset | List[DataArray] - Right components of the fitted model. - - """ - return super().components() - - def scores(self): - """Return the scores of the left and right field. - - The scores in MCA are the projection of the left and right field onto the - left and right singular vector of the cross-covariance matrix. - - Returns - ------- - scores1: DataArray - Left scores. - scores2: DataArray - Right scores. - - """ - return super().scores() - - def homogeneous_patterns(self, correction=None, alpha=0.05): - """Return the homogeneous patterns of the left and right field. - - The homogeneous patterns are the correlation coefficients between the - input data and the scores. + def _squared_covariance(self) -> DataArray: + """Get the squared covariance. - More precisely, the homogeneous patterns `r_{hom}` are defined as + The squared covariance is given by the squared singular values of the + covariance matrix: .. math:: - r_{hom, x} = corr \\left(X, A_x \\right) - .. math:: - r_{hom, y} = corr \\left(Y, A_y \\right) + SC_i = \\sigma_i^2 - where :math:`X` and :math:`Y` are the input data, :math:`A_x` and :math:`A_y` - are the scores of the left and right field, respectively. - - Parameters - ---------- - correction: str, default=None - Method to apply a multiple testing correction. If None, no correction - is applied. Available methods are: - - bonferroni : one-step correction - - sidak : one-step correction - - holm-sidak : step down method using Sidak adjustments - - holm : step-down method using Bonferroni adjustments - - simes-hochberg : step-up method (independent) - - hommel : closed method based on Simes tests (non-negative) - - fdr_bh : Benjamini/Hochberg (non-negative) (default) - - fdr_by : Benjamini/Yekutieli (negative) - - fdr_tsbh : two stage fdr correction (non-negative) - - fdr_tsbky : two stage fdr correction (non-negative) - alpha: float, default=0.05 - The desired family-wise error rate. Not used if `correction` is None. - - Returns - ------- - patterns1: DataArray | Dataset | List[DataArray] - Left homogenous patterns. - patterns2: DataArray | Dataset | List[DataArray] - Right homogenous patterns. - pvals1: DataArray | Dataset | List[DataArray] - Left p-values. - pvals2: DataArray | Dataset | List[DataArray] - Right p-values. 
+ where :math:`\\sigma_i` is the `ith` singular value of the covariance + matrix. """ - input_data1 = self.data["input_data1"] - input_data2 = self.data["input_data2"] - - scores1 = self.data["scores1"] - scores2 = self.data["scores2"] - - hom_pat1, pvals1 = pearson_correlation( - input_data1, scores1, correction=correction, alpha=alpha - ) - hom_pat2, pvals2 = pearson_correlation( - input_data2, scores2, correction=correction, alpha=alpha - ) - - hom_pat1.name = "left_homogeneous_patterns" - hom_pat2.name = "right_homogeneous_patterns" + # only true for MCA, for alpha < 1 the sigmas become more and more correlation coefficients + # either remove this one and provide it only for MCA child class, or use error formulation + sc = self.data["squared_covariance"] + sc.name = "squared_covariance" + return sc - pvals1.name = "pvalues_of_left_homogeneous_patterns" - pvals2.name = "pvalues_of_right_homogeneous_patterns" - - hom_pat1 = self.preprocessor1.inverse_transform_components(hom_pat1) - hom_pat2 = self.preprocessor2.inverse_transform_components(hom_pat2) - - pvals1 = self.preprocessor1.inverse_transform_components(pvals1) - pvals2 = self.preprocessor2.inverse_transform_components(pvals2) - - return (hom_pat1, hom_pat2), (pvals1, pvals2) - - def heterogeneous_patterns(self, correction=None, alpha=0.05): - """Return the heterogeneous patterns of the left and right field. - - The heterogeneous patterns are the correlation coefficients between the - input data and the scores of the other field. - - More precisely, the heterogeneous patterns `r_{het}` are defined as - - .. math:: - r_{het, x} = corr \\left(X, A_y \\right) - .. math:: - r_{het, y} = corr \\left(Y, A_x \\right) + def _covariance_explained_DC95(self) -> DataArray: + """Get the covariance explained (CE) per mode according to CD95. - where :math:`X` and :math:`Y` are the input data, :math:`A_x` and :math:`A_y` - are the scores of the left and right field, respectively. - - Parameters + References ---------- - correction: str, default=None - Method to apply a multiple testing correction. If None, no correction - is applied. Available methods are: - - bonferroni : one-step correction - - sidak : one-step correction - - holm-sidak : step down method using Sidak adjustments - - holm : step-down method using Bonferroni adjustments - - simes-hochberg : step-up method (independent) - - hommel : closed method based on Simes tests (non-negative) - - fdr_bh : Benjamini/Hochberg (non-negative) (default) - - fdr_by : Benjamini/Yekutieli (negative) - - fdr_tsbh : two stage fdr correction (non-negative) - - fdr_tsbky : two stage fdr correction (non-negative) - alpha: float, default=0.05 - The desired family-wise error rate. Not used if `correction` is None. + Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). 
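To make the distinction between the two normalizations used in this module concrete, here is a hedged toy illustration with made-up singular values (not values from a fitted model, and not part of the library code):

>>> import numpy as np
>>> sigma = np.array([4.0, 2.0, 1.0])    # toy singular values of the cross-covariance matrix
>>> cf = sigma / sigma.sum()             # covariance fraction (CF) sensu Cheng & Dunkerton (1995)
>>> scf = sigma**2 / (sigma**2).sum()    # squared covariance fraction (SCF)

The SCF weights the leading modes more heavily than the CF, and only the SCF is invariant under the decomposition, which is why it is the preferred measure of mode importance.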
""" - input_data1 = self.data["input_data1"] - input_data2 = self.data["input_data2"] - - scores1 = self.data["scores1"] - scores2 = self.data["scores2"] - - patterns1, pvals1 = pearson_correlation( - input_data1, scores2, correction=correction, alpha=alpha - ) - patterns2, pvals2 = pearson_correlation( - input_data2, scores1, correction=correction, alpha=alpha - ) + cov_exp = self._squared_covariance() ** (0.5) + cov_exp.name = "pseudo_explained_covariance" + return cov_exp - patterns1.name = "left_heterogeneous_patterns" - patterns2.name = "right_heterogeneous_patterns" - - pvals1.name = "pvalues_of_left_heterogeneous_patterns" - pvals2.name = "pvalues_of_right_heterogeneous_patterns" - - patterns1 = self.preprocessor1.inverse_transform_components(patterns1) - patterns2 = self.preprocessor2.inverse_transform_components(patterns2) - - pvals1 = self.preprocessor1.inverse_transform_components(pvals1) - pvals2 = self.preprocessor2.inverse_transform_components(pvals2) + def _total_covariance(self) -> DataArray: + """Get the total covariance. - return (patterns1, patterns2), (pvals1, pvals2) + This measure follows the defintion of Cheng and Dunkerton (1995). + Note that this measure is not an invariant in MCA. - def _validate_loaded_data(self, data: xr.DataArray): - if data.attrs.get("placeholder"): - warnings.warn( - f"The input data field '{data.name}' was not saved, which will produce" - " empty results when calling `homogeneous_patterns()` or " - "`heterogeneous_patterns()`. To avoid this warning, you can save the" - " model with `save_data=True`, or add the data manually by running" - " it through the model's `preprocessor.transform()` method and then" - " attaching it with `data.add()`." - ) + """ + pseudo_tot_cov = self._covariance_explained_DC95().sum() + pseudo_tot_cov.name = "pseudo_total_covariance" + return pseudo_tot_cov -class ComplexMCA(MCA): - """Complex Maximum Covariance Analysis. +class ComplexMCA(ComplexCPCCA, MCA): + """Complex MCA. MCA applied to a complex-valued field obtained from a pair of variables such as the zonal and meridional components, :math:`U` and :math:`V`, of the wind @@ -587,39 +244,43 @@ class ComplexMCA(MCA): Parameters ---------- - n_modes: int, default=2 + n_modes : int, default=2 Number of modes to calculate. - center: bool, default=True - Whether to center the input data. - standardize: bool, default=False - Whether to standardize the input data. - use_coslat: bool, default=False - Whether to use cosine of latitude for scaling. - n_pca_modes: int, default=None - The number of principal components to retain during the PCA - preprocessing step applied to both data sets prior to executing MCA. If - set to None, PCA preprocessing will be bypassed, and the MCA will be - performed on the original datasets. Specifying an integer value greater - than 0 for `n_pca_modes` will trigger the PCA preprocessing, retaining - only the specified number of principal components. This reduction in - dimensionality can be especially beneficial when dealing with - high-dimensional data, where computing the cross-covariance matrix can - become computationally intensive or in scenarios where multicollinearity - is a concern. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. 
+ use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is then computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. compute : bool, default=True - Whether to compute elements of the model eagerly, or to defer - computation. If True, four pieces of the fit will be computed - sequentially: 1) the preprocessor scaler, 2) optional NaN checks, 3) SVD - decomposition, 4) scores and components. - sample_name: str, default="sample" - Name of the new sample dimension. - feature_name: str, default="feature" - Name of the new feature dimension. - solver: {"auto", "full", "randomized"}, default="auto" - Solver to use for the SVD computation. - random_state: int, default=None + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None Seed for the random number generator. - solver_kwargs: dict, default={} + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} Additional keyword arguments passed to the SVD solver function. 
Examples @@ -643,307 +304,167 @@ class ComplexMCA(MCA): def __init__( self, n_modes: int = 2, - center: bool = True, - standardize: bool = False, - use_coslat: bool = False, - check_nans: bool = True, - n_pca_modes: Optional[int] = None, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, compute: bool = True, + verbose: bool = False, sample_name: str = "sample", - feature_name: str = "feature", + feature_name: Sequence[str] | str = "feature", solver: str = "auto", - random_state: Optional[int] = None, - solver_kwargs: Dict = {}, - **kwargs, + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, ): - super().__init__( + ComplexCPCCA.__init__( + self, n_modes=n_modes, - center=center, + alpha=[1.0, 1.0], standardize=standardize, use_coslat=use_coslat, check_nans=check_nans, + use_pca=use_pca, n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, compute=compute, + verbose=verbose, sample_name=sample_name, feature_name=feature_name, solver=solver, random_state=random_state, solver_kwargs=solver_kwargs, - **kwargs, ) self.attrs.update({"model": "Complex MCA"}) + # Renove alpha from the inherited CPCCA serialization params because it is hard-coded for MCA + self._params.pop("alpha") - def _fit_algorithm(self, data1: DataArray, data2: DataArray) -> Self: - if not np.iscomplexobj(data1) and not np.iscomplexobj(data2): - warnings.warn( - "Expected complex-valued data but found real-valued data. For Hilbert MCA, use `HilbertMCA` model." - ) - - return super()._fit_algorithm(data1, data2) - - def components_amplitude(self) -> Tuple[DataObject, DataObject]: - """Compute the amplitude of the components. - - The amplitude of the components are defined as - - .. math:: - A_ij = |C_ij| - - where :math:`C_{ij}` is the :math:`i`-th entry of the :math:`j`-th component and - :math:`|\\cdot|` denotes the absolute value. - - Returns - ------- - DataObject - Amplitude of the left components. - DataObject - Amplitude of the left components. - - """ - comps1 = abs(self.data["components1"]) - comps1.name = "left_components_amplitude" - - comps2 = abs(self.data["components2"]) - comps2.name = "right_components_amplitude" - - comps1 = self.preprocessor1.inverse_transform_components(comps1) - comps2 = self.preprocessor2.inverse_transform_components(comps2) - - return (comps1, comps2) - - def components_phase(self) -> Tuple[DataObject, DataObject]: - """Compute the phase of the components. - - The phase of the components are defined as - - .. math:: - \\phi_{ij} = \\arg(C_{ij}) - - where :math:`C_{ij}` is the :math:`i`-th entry of the :math:`j`-th component and - :math:`\\arg(\\cdot)` denotes the argument of a complex number. - - Returns - ------- - DataObject - Phase of the left components. - DataObject - Phase of the right components. 
- - """ - comps1 = xr.apply_ufunc(np.angle, self.data["components1"], keep_attrs=True) - comps1.name = "left_components_phase" - comps2 = xr.apply_ufunc(np.angle, self.data["components2"], keep_attrs=True) - comps2.name = "right_components_phase" - - comps1 = self.preprocessor1.inverse_transform_components(comps1) - comps2 = self.preprocessor2.inverse_transform_components(comps2) - - return (comps1, comps2) - - def scores_amplitude(self) -> Tuple[DataArray, DataArray]: - """Compute the amplitude of the scores. - - The amplitude of the scores are defined as - - .. math:: - A_ij = |S_ij| - - where :math:`S_{ij}` is the :math:`i`-th entry of the :math:`j`-th score and - :math:`|\\cdot|` denotes the absolute value. - - Returns - ------- - DataArray - Amplitude of the left scores. - DataArray - Amplitude of the right scores. - - """ - scores1 = abs(self.data["scores1"]) - scores2 = abs(self.data["scores2"]) - - scores1.name = "left_scores_amplitude" - scores2.name = "right_scores_amplitude" - - scores1 = self.preprocessor1.inverse_transform_scores(scores1) - scores2 = self.preprocessor2.inverse_transform_scores(scores2) - return (scores1, scores2) - - def scores_phase(self) -> Tuple[DataArray, DataArray]: - """Compute the phase of the scores. - - The phase of the scores are defined as - - .. math:: - \\phi_{ij} = \\arg(S_{ij}) - - where :math:`S_{ij}` is the :math:`i`-th entry of the :math:`j`-th score and - :math:`\\arg(\\cdot)` denotes the argument of a complex number. - - Returns - ------- - DataArray - Phase of the left scores. - DataArray - Phase of the right scores. - - """ - scores1 = xr.apply_ufunc(np.angle, self.data["scores1"], keep_attrs=True) - scores2 = xr.apply_ufunc(np.angle, self.data["scores2"], keep_attrs=True) +class HilbertMCA(HilbertCPCCA, ComplexMCA): + """Hilbert MCA. - scores1.name = "left_scores_phase" - scores2.name = "right_scores_phase" + Hilbert MCA [1]_ (aka Analytical SVD), extends MCA by + examining amplitude-phase relationships. It augments the input data with its + Hilbert transform, creating a complex-valued field. - scores1 = self.preprocessor1.inverse_transform_scores(scores1) - scores2 = self.preprocessor2.inverse_transform_scores(scores2) + This method solves the following optimization problem: - return (scores1, scores2) + :math:`\\max_{q_x, q_y} \\left( q_x^H X^H Y q_y \\right)` - def homogeneous_patterns(self, correction=None, alpha=0.05): - raise NotImplementedError( - "Compplex MCA does not support homogeneous_patterns method." - ) + subject to the constraints: - def heterogeneous_patterns(self, correction=None, alpha=0.05): - raise NotImplementedError( - "Complex MCA does not support heterogeneous_patterns method." - ) + :math:`q_x^H q_x = 1, \\quad q_y^H q_y = 1` + where :math:`H` denotes the conjugate transpose and :math:`X` and :math:`Y` + are the augmented data matrices. -class HilbertMCA(ComplexMCA): - """Hilbert MCA. + An optional padding with exponentially decaying values can be applied prior + to the Hilbert transform in order to mitigate the impact of spectral + leakage. - Hilbert MCA, also referred to as Analytical SVD (ASVD) by Elipot et al. (2017) [1]_, - enhances traditional MCA by accommodating both amplitude and phase information. - It achieves this by utilizing the Hilbert transform to preprocess the data, - thus allowing for a more comprehensive analysis in the subsequent MCA computation. 
- - An optional padding with exponentially decaying values can be applied prior to - the Hilbert transform in order to mitigate the impact of spectral leakage. Parameters ---------- - n_modes: int, default=2 + n_modes : int, default=2 Number of modes to calculate. - padding : str, optional - Specifies the method used for padding the data prior to applying the Hilbert - transform. This can help to mitigate the effect of spectral leakage. - Currently, only 'exp' for exponential padding is supported. Default is 'exp'. - decay_factor : float, optional - Specifies the decay factor used in the exponential padding. This parameter - is only used if padding='exp'. The recommended value typically ranges between 0.05 to 0.2 - but ultimately depends on the variability in the data. - A smaller value (e.g. 0.05) is recommended for - data with high variability, while a larger value (e.g. 0.2) is recommended - for data with low variability. Default is 0.2. - center: bool, default=True - Whether to center the input data. - standardize: bool, default=False - Whether to standardize the input data. - use_coslat: bool, default=False - Whether to use cosine of latitude for scaling. - n_pca_modes: int, default=None - The number of principal components to retain during the PCA preprocessing - step applied to both data sets prior to executing MCA. - If set to None, PCA preprocessing will be bypassed, and the MCA will be performed on the original datasets. - Specifying an integer value greater than 0 for `n_pca_modes` will trigger the PCA preprocessing, retaining - only the specified number of principal components. This reduction in dimensionality can be especially beneficial - when dealing with high-dimensional data, where computing the cross-covariance matrix can become computationally - intensive or in scenarios where multicollinearity is a concern. + padding : Sequence[str] | str | None, default="exp" + Padding method for the Hilbert transform. Available options are: - None: + no padding - "exp": exponential decay + decay_factor : Sequence[float] | float, default=0.2 + Decay factor for the exponential padding. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. compute : bool, default=True - Whether to compute elements of the model eagerly, or to defer computation. 
- If True, four pieces of the fit will be computed sequentially: 1) the - preprocessor scaler, 2) optional NaN checks, 3) SVD decomposition, 4) scores - and components. - sample_name: str, default="sample" - Name of the new sample dimension. - feature_name: str, default="feature" - Name of the new feature dimension. - solver: {"auto", "full", "randomized"}, default="auto" + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} Solver to use for the SVD computation. - random_state: int, optional - Random state for randomized SVD solver. - solver_kwargs: dict, default={} - Additional keyword arguments passed to the SVD solver. - - Notes - ----- - Hilbert MCA extends MCA to complex-valued data that contain both magnitude and phase information. - The Hilbert transform is used to transform real-valued data to complex-valued data, from which both - amplitude and phase can be extracted. - - Similar to MCA, Hilbert MCA is used in climate science to identify coupled patterns of variability - between two different climate variables. But unlike MCA, Hilbert MCA can identify coupled patterns - that involve phase shifts. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. References ---------- - .. [1] Elipot, S., Frajka-Williams, E., Hughes, C. W., Olhede, S. & Lankhorst, M. Observed Basin-Scale Response of the North Atlantic Meridional Overturning Circulation to Wind Stress Forcing. Journal of Climate 30, 2029–2054 (2017). + .. [1] Elipot, S., Frajka-Williams, E., Hughes, C. W., Olhede, S. & + Lankhorst, M. Observed Basin-Scale Response of the North Atlantic + Meridional Overturning Circulation to Wind Stress Forcing. Journal of + Climate 30, 2029–2054 (2017). 
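The augmentation step described above can be sketched with scipy's analytic signal; this is only an illustration of the idea (the exponential padding applied internally is omitted, and the variable names are made up):

>>> import numpy as np
>>> from scipy.signal import hilbert
>>> t = np.linspace(0, 10 * np.pi, 500)
>>> x = np.sin(t)                              # real-valued input series
>>> z = hilbert(x)                             # complex analytic signal: x + i * H(x)
>>> amplitude, phase = np.abs(z), np.angle(z)  # the kind of quantities exposed via the *_amplitude() and *_phase() methods

Working with the complex field ``z`` instead of ``x`` is what allows the model to capture coupled patterns that involve phase shifts between the two datasets.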
Examples -------- - >>> model = HilbertMCA(n_modes=5, standardize=True) - >>> model.fit(data1, data2) + >>> model = HilbertMCA(n_modes=5) + >>> model.fit(X, Y, "time") """ def __init__( self, n_modes: int = 2, - padding: str = "exp", - decay_factor: float = 0.2, - center: bool = True, - standardize: bool = False, - use_coslat: bool = False, - check_nans: bool = True, - n_pca_modes: Optional[int] = None, + padding: Sequence[str] | str | None = "exp", + decay_factor: Sequence[float] | float = 0.2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, compute: bool = True, + verbose: bool = False, sample_name: str = "sample", - feature_name: str = "feature", + feature_name: Sequence[str] | str = "feature", solver: str = "auto", - random_state: Optional[bool] = None, - solver_kwargs: Dict = {}, - **kwargs, + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, ): - super().__init__( + HilbertCPCCA.__init__( + self, n_modes=n_modes, - center=center, + alpha=[1.0, 1.0], standardize=standardize, use_coslat=use_coslat, check_nans=check_nans, + use_pca=use_pca, n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, compute=compute, + verbose=verbose, sample_name=sample_name, feature_name=feature_name, solver=solver, random_state=random_state, solver_kwargs=solver_kwargs, - **kwargs, + padding=padding, + decay_factor=decay_factor, ) self.attrs.update({"model": "Hilbert MCA"}) - self._params.update({"padding": padding, "decay_factor": decay_factor}) - - def _augment_data(self, data, dims): - kwargs = dict( - padding=self._params["padding"], decay_factor=self._params["decay_factor"] - ) - return hilbert_transform(data, dims=dims, **kwargs) - - def _fit_algorithm(self, data1: DataArray, data2: DataArray) -> Self: - assert_not_complex(data1) - assert_not_complex(data2) - - MCA._fit_algorithm(self, data1, data2) - return self - - def transform(self, data1: DataObject, data2: DataObject): - raise NotImplementedError("Hilbert MCA does not support transform method.") - - def _inverse_transform_algorithm(self, scores1: DataArray, scores2: DataArray): - data1, data2 = super()._inverse_transform_algorithm(scores1, scores2) - - # Enforce real output - return data1.real, data2.real + # Renove alpha from the inherited CPCCA serialization params because it is hard-coded for MCA + self._params.pop("alpha") diff --git a/xeofs/models/mca_rotator.py b/xeofs/models/mca_rotator.py index 9fba447..4ee0205 100644 --- a/xeofs/models/mca_rotator.py +++ b/xeofs/models/mca_rotator.py @@ -1,62 +1,55 @@ -from datetime import datetime -from typing import Dict, List - -import numpy as np -import xarray as xr -from typing_extensions import Self - -from .._version import __version__ -from ..data_container import DataContainer -from ..preprocessing.preprocessor import Preprocessor -from ..utils.data_types import DataArray, DataObject -from ..utils.rotation import promax -from ..utils.xarray_utils import argsort_dask, get_deterministic_sign_multiplier +from .cpcca_rotator import ComplexCPCCARotator, CPCCARotator, HilbertCPCCARotator from .mca import MCA, ComplexMCA, HilbertMCA -class MCARotator(MCA): +class MCARotator(CPCCARotator, MCA): """Rotate a solution obtained from ``xe.models.MCA``. 
- Rotated MCA [1]_ is an extension of the standard MCA that applies an additional rotation - to the computed modes to maximize the variance explained individually by each mode. - This rotation method enhances interpretability by distributing the explained variance more - evenly among the modes, making it easier to discern patterns within the data. + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_. Parameters ---------- n_modes : int, default=10 Specify the number of modes to be rotated. power : int, default=1 - Set the power for the Promax rotation. A ``power`` value of 1 results - in a Varimax rotation. + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. max_iter : int or None, default=None Determine the maximum number of iterations for the computation of the rotation matrix. If not specified, defaults to 1000 if ``compute=True`` - and 100 if ``compute=False``, since we can't terminate a lazy computation - based using ``rtol``. + and 100 if ``compute=False``, since we can't terminate a lazy + computation based on ``rtol``. rtol : float, default=1e-8 Define the relative tolerance required to achieve convergence and terminate the iterative process. - squared_loadings : bool, default=False - Specify the method for constructing the combined vectors of loadings. If True, - the combined vectors are loaded with the singular values (termed "squared loadings"), - conserving the squared covariance under rotation. This allows estimation of mode importance - after rotation. If False, the combined vectors are loaded with the square root of the - singular values, following the method described by Cheng & Dunkerton. compute : bool, default=True Whether to compute the rotation immediately. References ---------- - .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns Derived from Singular Value Decomposition Analysis. J. Climate 8, 2631–2643 (1995). + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). 
Examples -------- - >>> model = MCA(n_modes=5) - >>> model.fit(da1, da2, dim='time') - >>> rotator = MCARotator(n_modes=5, power=2) + + Perform a MCA: + + >>> model = MCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to first 5 components and scores: + + >>> rotator = MCARotator(n_modes=5) >>> rotator.fit(model) + + Retrieve the rotated components and scores: + >>> rotator.components() + >>> rotator.scores() """ @@ -66,454 +59,171 @@ def __init__( power: int = 1, max_iter: int | None = None, rtol: float = 1e-8, - squared_loadings: bool = False, compute: bool = True, ): - if max_iter is None: - max_iter = 1000 if compute else 100 - - # Define model parameters - self._params = { - "n_modes": n_modes, - "power": power, - "max_iter": max_iter, - "rtol": rtol, - "squared_loadings": squared_loadings, - "compute": compute, - } - - # Define analysis-relevant meta data - self.attrs = {"model": "Rotated MCA"} - self.attrs.update(self._params) - self.attrs.update( - { - "software": "xeofs", - "version": __version__, - "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } + CPCCARotator.__init__( + self, + n_modes=n_modes, + power=power, + max_iter=max_iter, + rtol=rtol, + compute=compute, ) - # Attach empty objects - self.preprocessor1 = Preprocessor() - self.preprocessor2 = Preprocessor() - self.data = DataContainer() + # Define analysis-relevant meta data + self.attrs.update({"model": "Rotated MCA"}) self.model = MCA() - self.sorted = False - - def get_serialization_attrs(self) -> Dict: - return dict( - data=self.data, - preprocessor1=self.preprocessor1, - preprocessor2=self.preprocessor2, - model=self.model, - sorted=self.sorted, - ) - - def _compute_rot_mat_inv_trans(self, rotation_matrix, input_dims) -> xr.DataArray: - """Compute the inverse transpose of the rotation matrix. - - For orthogonal rotations (e.g., Varimax), the inverse transpose is equivalent - to the rotation matrix itself. For oblique rotations (e.g., Promax), the simplification - does not hold. - - Returns - ------- - rotation_matrix : xr.DataArray - - """ - if self._params["power"] > 1: - # inverse matrix - rotation_matrix = xr.apply_ufunc( - np.linalg.inv, - rotation_matrix, - input_core_dims=[(input_dims)], - output_core_dims=[(input_dims[::-1])], - vectorize=False, - dask="allowed", - ) - # transpose matrix - rotation_matrix = rotation_matrix.conj().transpose(*input_dims) - return rotation_matrix - - def fit(self, model: MCA) -> Self: - """Rotate the solution obtained from ``xe.models.MCA``. - - Parameters - ---------- - model : ``xe.models.MCA`` - The MCA model to be rotated. - - """ - self._fit_algorithm(model) - - if self._params["compute"]: - self.compute() - - return self - - def _fit_algorithm(self, model) -> Self: - self.model = model - self.preprocessor1 = model.preprocessor1 - self.preprocessor2 = model.preprocessor2 - self.sample_name = self.model.sample_name - self.feature_name = self.model.feature_name - self.sorted = False - - n_modes = self._params["n_modes"] - power = self._params["power"] - max_iter = self._params["max_iter"] - rtol = self._params["rtol"] - use_squared_loadings = self._params["squared_loadings"] - - # Construct the combined vector of loadings - # NOTE: In the methodology used by Cheng & Dunkerton (CD), the combined vectors are "loaded" or weighted - # with the square root of the singular values, akin to what is done in standard Varimax rotation. This method - # ensures that the total amount of covariance is conserved during the rotation process. 
- # However, in Maximum Covariance Analysis (MCA), the focus is usually on the squared covariance to determine - # the importance of a given mode. The approach adopted by CD does not preserve the squared covariance under - # rotation, making it impossible to estimate the importance of modes post-rotation. - # To resolve this issue, one possible workaround is to rotate the singular vectors that have been "loaded" - # or weighted with the singular values ("squared loadings"), as opposed to the square root of the singular values. - # In doing so, the squared covariance remains conserved under rotation, allowing for the estimation of the - # modes' importance. - norm1 = self.model.data["norm1"].sel(mode=slice(1, n_modes)) - norm2 = self.model.data["norm2"].sel(mode=slice(1, n_modes)) - if use_squared_loadings: - # Squared loadings approach conserving squared covariance - scaling = norm1 * norm2 - else: - # Cheng & Dunkerton approach conserving covariance - scaling = np.sqrt(norm1 * norm2) - - comps1 = self.model.data["components1"].sel(mode=slice(1, n_modes)) - comps2 = self.model.data["components2"].sel(mode=slice(1, n_modes)) - loadings = xr.concat([comps1, comps2], dim=self.feature_name) * scaling - - # Rotate loadings - promax_kwargs = {"power": power, "max_iter": max_iter, "rtol": rtol} - rot_loadings, rot_matrix, phi_matrix = promax( - loadings=loadings, - feature_dim=self.feature_name, - compute=self._params["compute"], - **promax_kwargs, - ) - - # Assign coordinates to the rotation/correlation matrices - rot_matrix = rot_matrix.assign_coords( - mode_m=np.arange(1, rot_matrix.mode_m.size + 1), - mode_n=np.arange(1, rot_matrix.mode_n.size + 1), - ) - phi_matrix = phi_matrix.assign_coords( - mode_m=np.arange(1, phi_matrix.mode_m.size + 1), - mode_n=np.arange(1, phi_matrix.mode_n.size + 1), - ) - - # Rotated (loaded) singular vectors - comps1_rot = rot_loadings.isel( - {self.feature_name: slice(0, comps1.coords[self.feature_name].size)} - ) - comps2_rot = rot_loadings.isel( - {self.feature_name: slice(comps1.coords[self.feature_name].size, None)} - ) - - # Normalization factor of singular vectors - norm1_rot = xr.apply_ufunc( - np.linalg.norm, - comps1_rot, - input_core_dims=[[self.feature_name, "mode"]], - output_core_dims=[["mode"]], - exclude_dims={self.feature_name}, - kwargs={"axis": 0}, - vectorize=False, - dask="allowed", - ) - norm2_rot = xr.apply_ufunc( - np.linalg.norm, - comps2_rot, - input_core_dims=[[self.feature_name, "mode"]], - output_core_dims=[["mode"]], - exclude_dims={self.feature_name}, - kwargs={"axis": 0}, - vectorize=False, - dask="allowed", - ) - - # Rotated (normalized) singular vectors - comps1_rot = comps1_rot / norm1_rot - comps2_rot = comps2_rot / norm2_rot - - # Remove the squaring introduced by the squared loadings approach - if use_squared_loadings: - norm1_rot = norm1_rot**0.5 - norm2_rot = norm2_rot**0.5 - - # norm1 * norm2 = "singular values" - squared_covariance = (norm1_rot * norm2_rot) ** 2 - - # Reorder according to squared covariance - idx_modes_sorted = argsort_dask(squared_covariance, "mode")[::-1] - idx_modes_sorted.coords.update(squared_covariance.coords) - - # Rotate scores using rotation matrix - scores1 = self.model.data["scores1"].sel(mode=slice(1, n_modes)) - scores2 = self.model.data["scores2"].sel(mode=slice(1, n_modes)) - RinvT = self._compute_rot_mat_inv_trans( - rot_matrix, input_dims=("mode_m", "mode_n") - ) - # Rename dimension mode to ensure that dot product has dimensions (sample x mode) as output - scores1 = 
scores1.rename({"mode": "mode_m"}) - scores2 = scores2.rename({"mode": "mode_m"}) - RinvT = RinvT.rename({"mode_n": "mode"}) - scores1_rot = xr.dot(scores1, RinvT, dims="mode_m") - scores2_rot = xr.dot(scores2, RinvT, dims="mode_m") - - # Ensure consitent signs for deterministic output - modes_sign = get_deterministic_sign_multiplier(rot_loadings, self.feature_name) - comps1_rot = comps1_rot * modes_sign - comps2_rot = comps2_rot * modes_sign - scores1_rot = scores1_rot * modes_sign - scores2_rot = scores2_rot * modes_sign - - # Create data container - self.data.add( - name="input_data1", data=self.model.data["input_data1"], allow_compute=False - ) - self.data.add( - name="input_data2", data=self.model.data["input_data2"], allow_compute=False - ) - self.data.add(name="components1", data=comps1_rot) - self.data.add(name="components2", data=comps2_rot) - self.data.add(name="scores1", data=scores1_rot) - self.data.add(name="scores2", data=scores2_rot) - self.data.add(name="squared_covariance", data=squared_covariance) - self.data.add( - name="total_squared_covariance", - data=self.model.data["total_squared_covariance"], - ) - self.data.add(name="idx_modes_sorted", data=idx_modes_sorted) - self.data.add(name="norm1", data=norm1_rot) - self.data.add(name="norm2", data=norm2_rot) - self.data.add(name="rotation_matrix", data=rot_matrix) - self.data.add(name="phi_matrix", data=phi_matrix) - self.data.add(name="modes_sign", data=modes_sign) - - # Assign analysis-relevant meta data - self.data.set_attrs(self.attrs) - - return self - - def _post_compute(self): - """Leave sorting until after compute because it can't be done lazily.""" - self._sort_by_variance() - - def _sort_by_variance(self): - """Re-sort the mode dimension of all data variables by variance explained.""" - if not self.sorted: - for key in self.data.keys(): - if "mode" in self.data[key].dims and key != "idx_modes_sorted": - self.data[key] = ( - self.data[key] - .isel(mode=self.data["idx_modes_sorted"].values) - .assign_coords(mode=self.data[key].mode) - ) - self.sorted = True - - def transform( - self, data1: DataObject | None = None, data2: DataObject | None = None - ) -> DataArray | List[DataArray]: - """Project new "unseen" data onto the rotated singular vectors. - - Parameters - ---------- - data1 : DataArray | Dataset | List[DataArray] - Data to be projected onto the rotated singular vectors of the first dataset. - data2 : DataArray | Dataset | List[DataArray] - Data to be projected onto the rotated singular vectors of the second dataset. - - Returns - ------- - DataArray | List[DataArray] - Projected data. - - """ - # raise error if no data is provided - if data1 is None and data2 is None: - raise ValueError("No data provided. 
Please provide data1 and/or data2.") - - n_modes = self._params["n_modes"] - rot_matrix = self.data["rotation_matrix"] - RinvT = self._compute_rot_mat_inv_trans( - rot_matrix, input_dims=("mode_m", "mode_n") - ) - RinvT = RinvT.rename({"mode_n": "mode"}) - - results = [] - - if data1 is not None: - # Select the (non-rotated) singular vectors of the first dataset - comps1 = self.model.data["components1"].sel(mode=slice(1, n_modes)) - norm1 = self.model.data["norm1"].sel(mode=slice(1, n_modes)) - - # Preprocess the data - data1 = self.preprocessor1.transform(data1) - - # Compute non-rotated scores by projecting the data onto non-rotated components - projections1 = xr.dot(data1, comps1) / norm1 - # Rotate the scores - projections1 = projections1.rename({"mode": "mode_m"}) - projections1 = xr.dot(projections1, RinvT, dims="mode_m") - # Reorder according to variance - if self.sorted: - projections1 = projections1.isel( - mode=self.data["idx_modes_sorted"].values - ).assign_coords(mode=projections1.mode) - # Adapt the sign of the scores - projections1 = projections1 * self.data["modes_sign"] - - # Unstack the projections - projections1 = self.preprocessor1.inverse_transform_scores(projections1) - - results.append(projections1) - - if data2 is not None: - # Select the (non-rotated) singular vectors of the second dataset - comps2 = self.model.data["components2"].sel(mode=slice(1, n_modes)) - norm2 = self.model.data["norm2"].sel(mode=slice(1, n_modes)) - - # Preprocess the data - data2 = self.preprocessor2.transform(data2) - - # Compute non-rotated scores by project the data onto non-rotated components - projections2 = xr.dot(data2, comps2) / norm2 - # Rotate the scores - projections2 = projections2.rename({"mode": "mode_m"}) - projections2 = xr.dot(projections2, RinvT, dims="mode_m") - # Reorder according to variance - if self.sorted: - projections2 = projections2.isel( - mode=self.data["idx_modes_sorted"].values - ).assign_coords(mode=projections2.mode) - # Determine the sign of the scores - projections2 = projections2 * self.data["modes_sign"] - - # Unstack the projections - projections2 = self.preprocessor2.inverse_transform_scores(projections2) - - results.append(projections2) - - if len(results) == 0: - raise ValueError("provide at least one of [`data1`, `data2`]") - elif len(results) == 1: - return results[0] - else: - return results - - -class ComplexMCARotator(MCARotator, ComplexMCA): +class ComplexMCARotator(ComplexCPCCARotator, ComplexMCA): """Rotate a solution obtained from ``xe.models.ComplexMCA``. + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_, Elipot et al. (2017) [2]_ and Rieger et al. (2021). + Parameters ---------- n_modes : int, default=10 Specify the number of modes to be rotated. power : int, default=1 - Set the power for the Promax rotation. A ``power`` value of 1 results - in a Varimax rotation. + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. max_iter : int, default=1000 Determine the maximum number of iterations for the computation of the rotation matrix. rtol : float, default=1e-8 Define the relative tolerance required to achieve convergence and terminate the iterative process. - squared_loadings : bool, default=False - Specify the method for constructing the combined vectors of loadings. 
If True, - the combined vectors are loaded with the singular values (termed "squared loadings"), - conserving the squared covariance under rotation. This allows estimation of mode importance - after rotation. If False, the combined vectors are loaded with the square root of the - singular values, following the method described by Cheng & Dunkerton. compute: bool, default=True Whether to compute the rotation immediately. + References + ---------- + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). + .. [2] Elipot, S., Frajka-Williams, E., Hughes, C. W., Olhede, S. & + Lankhorst, M. Observed Basin-Scale Response of the North Atlantic + Meridional Overturning Circulation to Wind Stress Forcing. Journal of + Climate 30, 2029–2054 (2017). + .. [3] Rieger, N., Corral, Á., Olmedo, E. & Turiel, A. Lagged + Teleconnections of Climate Variables Identified via Complex Rotated + Maximum Covariance Analysis. Journal of Climate 34, 9861–9878 (2021). + + Examples -------- - >>> model = ComplexMCA(n_modes=5) - >>> model.fit(da1, da2, dim='time') - >>> rotator = ComplexMCARotator(n_modes=5, power=2) + + Perform a Complex MCA: + + >>> model = ComplexMCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to first 5 components and scores: + + >>> rotator = ComplexMCARotator(n_modes=5) >>> rotator.fit(model) + + Retrieve the rotated components and scores: + >>> rotator.components() + >>> rotator.scores() """ - def __init__(self, **kwargs): - super().__init__(**kwargs) + def __init__( + self, + n_modes: int = 10, + power: int = 1, + max_iter: int | None = None, + rtol: float = 1e-8, + compute: bool = True, + ): + ComplexCPCCARotator.__init__( + self, + n_modes=n_modes, + power=power, + max_iter=max_iter, + rtol=rtol, + compute=compute, + ) self.attrs.update({"model": "Rotated Complex MCA"}) self.model = ComplexMCA() -class HilbertMCARotator(MCARotator, HilbertMCA): +class HilbertMCARotator(HilbertCPCCARotator, HilbertMCA): """Rotate a solution obtained from ``xe.models.HilbertMCA``. - Hilbert Rotated MCA [1]_ [2]_ [3]_ extends MCA by incorporating both amplitude and phase information - using a Hilbert transform prior to performing MCA and subsequent Varimax or Promax rotation. - This adds a further layer of dimensionality to the analysis, allowing for a more nuanced interpretation - of complex relationships within the data, particularly useful when analyzing oscillatory data. + Rotate the obtained components and scores of a CPCCA model to increase + interpretability. The algorithm here is based on the approach of Cheng & + Dunkerton (1995) [1]_, Elipot et al. (2017) [2]_ and Rieger et al. (2021). Parameters ---------- n_modes : int, default=10 Specify the number of modes to be rotated. power : int, default=1 - Set the power for the Promax rotation. A ``power`` value of 1 results - in a Varimax rotation. + Set the power for the Promax rotation. A ``power`` value of 1 results in + a Varimax rotation. max_iter : int, default=1000 Determine the maximum number of iterations for the computation of the rotation matrix. rtol : float, default=1e-8 Define the relative tolerance required to achieve convergence and terminate the iterative process. - squared_loadings : bool, default=False - Specify the method for constructing the combined vectors of loadings. 
If True, - the combined vectors are loaded with the singular values (termed "squared loadings"), - conserving the squared covariance under rotation. This allows estimation of mode importance - after rotation. If False, the combined vectors are loaded with the square root of the - singular values, following the method described by Cheng & Dunkerton. compute: bool, default=True Whether to compute the rotation immediately. References ---------- - .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns Derived from Singular Value Decomposition Analysis. J. Climate 8, 2631–2643 (1995). - .. [2] Elipot, S., Frajka-Williams, E., Hughes, C. W., Olhede, S. & Lankhorst, M. Observed Basin-Scale Response of the North Atlantic Meridional Overturning Circulation to Wind Stress Forcing. Journal of Climate 30, 2029–2054 (2017). - .. [3] Rieger, N., Corral, Á., Olmedo, E. & Turiel, A. Lagged Teleconnections of Climate Variables Identified via Complex Rotated Maximum Covariance Analysis. Journal of Climate 34, 9861–9878 (2021). - + .. [1] Cheng, X. & Dunkerton, T. J. Orthogonal Rotation of Spatial Patterns + Derived from Singular Value Decomposition Analysis. J. Climate 8, + 2631–2643 (1995). + .. [2] Elipot, S., Frajka-Williams, E., Hughes, C. W., Olhede, S. & + Lankhorst, M. Observed Basin-Scale Response of the North Atlantic + Meridional Overturning Circulation to Wind Stress Forcing. Journal of + Climate 30, 2029–2054 (2017). + .. [3] Rieger, N., Corral, Á., Olmedo, E. & Turiel, A. Lagged + Teleconnections of Climate Variables Identified via Complex Rotated + Maximum Covariance Analysis. Journal of Climate 34, 9861–9878 (2021). Examples -------- - >>> model = HilbertMCA(n_modes=5) - >>> model.fit(da1, da2, dim='time') - >>> rotator = HilbertMCARotator(n_modes=5, power=2) + + Perform a Hilbert MCA: + + >>> model = HilbertMCA(n_modes=10) + >>> model.fit(X, Y, dim='time') + + Then, apply varimax rotation to first 5 components and scores: + + >>> rotator = HilbertMCARotator(n_modes=5) >>> rotator.fit(model) + + Retrieve the rotated components and scores: + >>> rotator.components() + >>> rotator.scores() """ - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.attrs.update({"model": "Hilbert Rotated MCA"}) + def __init__( + self, + n_modes: int = 10, + power: int = 1, + max_iter: int | None = None, + rtol: float = 1e-8, + compute: bool = True, + ): + HilbertCPCCARotator.__init__( + self, + n_modes=n_modes, + power=power, + max_iter=max_iter, + rtol=rtol, + compute=compute, + ) + self.attrs.update({"model": "Rotated Hilbert MCA"}) self.model = HilbertMCA() - - def transform(self, **kwargs): - # Here we make use of the Method Resolution Order (MRO) to call the - # transform method of the first class in the MRO after `MCARotator` - # that has a transform method. In this case it will be `HilbertMCA`, - # which will raise an error because it does not have a transform method. 
- super(MCARotator, self).transform(**kwargs) - - def homogeneous_patterns(self, **kwargs): - super(MCARotator, self).homogeneous_patterns(**kwargs) - - def heterogeneous_patterns(self, **kwargs): - super(MCARotator, self).homogeneous_patterns(**kwargs) diff --git a/xeofs/models/opa.py b/xeofs/models/opa.py index 08a4b53..a7172f4 100644 --- a/xeofs/models/opa.py +++ b/xeofs/models/opa.py @@ -1,17 +1,17 @@ -from typing import Optional, Dict -from typing_extensions import Self +from typing import Dict, Optional -import xarray as xr import numpy as np +import xarray as xr +from typing_extensions import Self -from ._base_model import _BaseModel -from .eof import EOF -from .decomposer import Decomposer -from ..utils.data_types import DataObject, DataArray +from ..utils.data_types import DataArray, DataObject from ..utils.sanity_checks import assert_not_complex +from ._base_model_single_set import _BaseModelSingleSet +from .decomposer import Decomposer +from .eof import EOF -class OPA(_BaseModel): +class OPA(_BaseModelSingleSet): """Optimal Persistence Analysis. Optimal Persistence Analysis (OPA) [1]_ [2]_ identifies the patterns with the diff --git a/xeofs/models/sparse_pca.py b/xeofs/models/sparse_pca.py index 27c9299..025877c 100644 --- a/xeofs/models/sparse_pca.py +++ b/xeofs/models/sparse_pca.py @@ -9,11 +9,11 @@ from ..utils.sanity_checks import assert_not_complex from ..utils.xarray_utils import get_matrix_rank from ..utils.xarray_utils import total_variance as compute_total_variance -from ._base_model import _BaseModel +from ._base_model_single_set import _BaseModelSingleSet from ._np_classes._sparse_pca import compute_rspca, compute_spca -class SparsePCA(_BaseModel): +class SparsePCA(_BaseModelSingleSet): """ Sparse PCA via Variable Projection. diff --git a/xeofs/models/svd.py b/xeofs/models/svd.py new file mode 100644 index 0000000..7f228a0 --- /dev/null +++ b/xeofs/models/svd.py @@ -0,0 +1,81 @@ +import numpy as np +import xarray as xr +from dask.base import compute as dask_compute + +from ..utils.data_types import DataArray +from ._np_classes._svd import _SVD + + +class SVD: + def __init__( + self, + n_modes: int | float | str, + init_rank_reduction: float = 0.3, + flip_signs: bool = True, + solver: str = "auto", + compute: bool = True, + random_state: np.random.Generator | int | None = None, + verbose: bool = False, + solver_kwargs: dict = {}, + sample_name: str = "sample", + feature_name: str = "feature", + ): + self.n_modes = n_modes + self.init_rank_reduction = init_rank_reduction + self.flip_signs = flip_signs + self.solver = solver + self.random_state = random_state + self.verbose = verbose + self.solver_kwargs = solver_kwargs + self.compute_svd = compute + + self.sample_name = sample_name + self.feature_name = feature_name + + def fit_transform(self, X: DataArray) -> tuple[DataArray, DataArray, DataArray]: + """Decomposes the data object. + + Parameters + ---------- + X : DataArray + A 2-dimensional data object to be decomposed. + + Returns + ------- + U : DataArray + The left singular vectors of the decomposition. + s : DataArray + The singular values of the decomposition. + V : DataArray + The right singular vectors of the decomposition. 
+ + """ + svd = _SVD( + n_modes=self.n_modes, + init_rank_reduction=self.init_rank_reduction, + flip_signs=self.flip_signs, + solver=self.solver, + random_state=self.random_state, + verbose=self.verbose, + **self.solver_kwargs, + ) + U, s, V = xr.apply_ufunc( + svd.fit_transform, + X, + input_core_dims=[[self.sample_name, self.feature_name]], + output_core_dims=[ + [self.sample_name, "mode"], + ["mode"], + [self.feature_name, "mode"], + ], + dask="allowed", + ) + mode_coords = np.arange(1, U.mode.size + 1) + s = s.assign_coords(mode=mode_coords) + U = U.assign_coords(mode=mode_coords) + V = V.assign_coords(mode=mode_coords) + + if self.compute_svd: + U, s, V = dask_compute(U, s, V) + + return U, s, V diff --git a/xeofs/preprocessing/pca_preprocessor.py b/xeofs/preprocessing/pca_preprocessor.py deleted file mode 100644 index 8a091e3..0000000 --- a/xeofs/preprocessing/pca_preprocessor.py +++ /dev/null @@ -1,155 +0,0 @@ -from typing import List, Optional, Tuple - -import numpy as np -from typing_extensions import Self - -from ..utils.data_types import ( - Data, - Dims, - DimsList, -) -from .concatenator import Concatenator -from .dimension_renamer import DimensionRenamer -from .multi_index_converter import MultiIndexConverter -from .preprocessor import Preprocessor -from .sanitizer import Sanitizer -from .scaler import Scaler -from .stacker import Stacker -from .whitener import Whitener - - -def extract_new_dim_names(X: List[DimensionRenamer]) -> Tuple[Dims, DimsList]: - """Extract the new dimension names from a list of DimensionRenamer objects. - - Parameters - ---------- - X : list of DimensionRenamer - List of DimensionRenamer objects. - - Returns - ------- - Dims - Sample dimensions - DimsList - Feature dimenions - - """ - new_sample_dims = [] - new_feature_dims: DimsList = [] - for x in X: - new_sample_dims.append(x.sample_dims_after) - new_feature_dims.append(x.feature_dims_after) - new_sample_dims: Dims = tuple(np.unique(np.asarray(new_sample_dims)).tolist()) - return new_sample_dims, new_feature_dims - - -class PCAPreprocessor(Preprocessor): - """Preprocess xarray objects and transform into (whitened) PC space. - - PCA-Preprocesser includes steps from Preprocessor class: - (i) Feature-wise scaling (e.g. removing mean, dividing by standard deviation, applying (latitude) weights - (ii) Renaming dimensions (to avoid conflicts with sample and feature dimensions) - (iii) Converting MultiIndexes to regular Indexes (MultiIndexes cannot be stacked) - (iv) Stacking the data into 2D DataArray - (v) Converting MultiIndexes introduced by stacking into regular Indexes - (vi) Removing NaNs - (vii) Concatenating the 2D DataArrays into one 2D DataArray - (viii) Transform into whitened PC space - - - Parameters - ---------- - n_modes : int | float - If int, specifies the number of modes to retain. If float, specifies the fraction of variance in the whitened data that should be explained by the retained modes. - alpha : float, default=0.0 - Degree of whitening. If 0, the data is completely whitened. If 1, the data is not whitened. - init_rank_reduction : float, default=0.3 - Fraction of the initial rank to reduce the data to before applying PCA. - sample_name : str, default="sample" - Name of the sample dimension. - feature_name : str, default="feature" - Name of the feature dimension. - with_center : bool, default=True - If True, the data is centered by subtracting the mean. - with_std : bool, default=True - If True, the data is divided by the standard deviation. 
- with_coslat : bool, default=False - If True, the data is multiplied by the square root of cosine of latitude weights. - with_weights : bool, default=False - If True, the data is multiplied by additional user-defined weights. - return_list : bool, default=True - If True, inverse_transform methods returns always a list of DataArray(s). - If False, the output is returned as a single DataArray if possible. - check_nans : bool, default=True - If True, remove full-dimensional NaN features from the data, check to ensure - that NaN features match the original fit data during transform, and check - for isolated NaNs. Note: this forces eager computation of dask arrays. - If False, skip all NaN checks. In this case, NaNs should be explicitly removed - or filled prior to fitting, or SVD will fail. - - """ - - def __init__( - self, - n_modes: int | float, - alpha: float = 1.0, - init_rank_reduction: float = 0.3, - sample_name: str = "sample", - feature_name: str = "feature", - with_center: bool = True, - with_std: bool = False, - with_coslat: bool = False, - return_list: bool = True, - check_nans: bool = True, - compute: bool = True, - ): - super().__init__( - sample_name=sample_name, - feature_name=feature_name, - with_center=with_center, - with_std=with_std, - with_coslat=with_coslat, - return_list=return_list, - check_nans=check_nans, - compute=compute, - ) - - # Set parameters - self.n_modes = n_modes - self.alpha = alpha - self.init_rank_reduction = init_rank_reduction - - # 8 | PCA-Whitener - self.whitener = Whitener( - n_modes=self.n_modes, - init_rank_reduction=self.init_rank_reduction, - alpha=self.alpha, - sample_name=self.sample_name, - feature_name=self.feature_name, - ) - - def transformer_types(self): - """Ordered list of transformer operations.""" - return dict( - scaler=Scaler, - renamer=DimensionRenamer, - preconverter=MultiIndexConverter, - stacker=Stacker, - postconverter=MultiIndexConverter, - sanitizer=Sanitizer, - concatenator=Concatenator, - whitener=Whitener, - ) - - def _fit_algorithm( - self, - X: List[Data] | Data, - sample_dims: Dims, - weights: Optional[List[Data] | Data] = None, - ) -> Tuple[Self, Data]: - _, X = super()._fit_algorithm(X, sample_dims, weights) - - # 8 | PCA-Whitening - X = self.whitener.fit_transform(X) # type: ignore - - return self, X diff --git a/xeofs/preprocessing/whitener.py b/xeofs/preprocessing/whitener.py index c81c60e..6a106da 100644 --- a/xeofs/preprocessing/whitener.py +++ b/xeofs/preprocessing/whitener.py @@ -5,7 +5,7 @@ import xarray as xr from typing_extensions import Self -from ..models.decomposer import Decomposer +from ..models.svd import SVD from ..utils.data_types import ( DataArray, Dims, @@ -33,10 +33,14 @@ class Whitener(Transformer): If int, number of components to keep. If float, fraction of variance to keep. If `n_modes="all"`, keep all components. init_rank_reduction: float, default=0.3 Used only when `n_modes` is given as a float. Specifiy the initial PCA rank reduction before truncating the solution to the desired fraction of explained variance. Must be in the half open interval ]0, 1]. Lower values will speed up the computation. + compute_svd: bool, default=False + Whether to perform eager or lazy computation. sample_name: str, default="sample" Name of the sample dimension. feature_name: str, default="feature" Name of the feature dimension. + random_state: np.random.Generator | int | None, default=None + Random seed for reproducibility. solver_kwargs: Dict Additional keyword arguments for the SVD solver. 
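Editor's aside (not part of the patch): a minimal, hedged usage sketch of the SVD helper added in xeofs/models/svd.py above, which the Whitener below now uses in place of the old Decomposer. The direct module import, the random seed, and the default ("sample", "feature") dimension names are assumptions for illustration; the class is an internal helper rather than guaranteed public API.

    import numpy as np
    import xarray as xr

    from xeofs.models.svd import SVD  # assumed direct import of the internal helper

    # 2D input with the default dimension names expected by the wrapper
    X = xr.DataArray(
        np.random.default_rng(7).standard_normal((50, 8)),
        dims=("sample", "feature"),
    )

    svd = SVD(n_modes=3, compute=True, random_state=7)
    U, s, V = svd.fit_transform(X)
    # U: (sample, mode), s: (mode,), V: (feature, mode), with mode coordinates 1..3

The eager/lazy behaviour is controlled by the ``compute`` flag, mirroring the ``compute_svd`` parameter documented for the Whitener above.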
@@ -48,8 +52,10 @@ def __init__( use_pca: bool = False, n_modes: int | float | str = "all", init_rank_reduction: float = 0.3, + compute_svd: bool = False, sample_name: str = "sample", feature_name: str = "feature", + random_state: np.random.Generator | int | None = None, solver_kwargs: Dict = {}, ): super().__init__(sample_name, feature_name) @@ -64,6 +70,8 @@ def __init__( self.use_pca = use_pca self.n_modes = n_modes self.init_rank_reduction = init_rank_reduction + self.compute_svd = compute_svd + self.random_state = random_state self.solver_kwargs = solver_kwargs # Check whether Whitener is identity transformation @@ -90,7 +98,7 @@ def _sanity_check_input(self, X) -> None: ) ) - def _get_n_modes(self, X: xr.DataArray) -> int | float: + def _get_n_modes(self, X: DataArray) -> int | float: if isinstance(self.n_modes, str): if self.n_modes == "all": return min(X.shape) @@ -114,7 +122,7 @@ def get_serialization_attrs(self) -> Dict: def fit( self, - X: xr.DataArray, + X: DataArray, sample_dims: Optional[Dims] = None, feature_dims: Optional[DimsList] = None, ) -> Self: @@ -132,16 +140,18 @@ def fit( # In case of "all" modes to the rank of the input data self.n_modes = self._get_n_modes(X) - decomposer = Decomposer( + svd = SVD( n_modes=self.n_modes, init_rank_reduction=self.init_rank_reduction, + compute=self.compute_svd, + random_state=self.random_state, + sample_name=self.sample_name, + feature_name=self.feature_name, **self.solver_kwargs, ) - decomposer.fit(X, dims=(self.sample_name, self.feature_name)) - s = decomposer.s_ - V = decomposer.V_ + _, s, V = svd.fit_transform(X) - n_c = np.sqrt(n_samples - 1) + n_c: float = np.sqrt(n_samples - 1) self.T: DataArray = V * (s / n_c) ** (self.alpha - 1) self.Tinv = (s / n_c) ** (1 - self.alpha) * V.conj().T self.s = s @@ -158,9 +168,7 @@ def fit( return self - def _compute_whitener_transform( - self, X: xr.DataArray - ) -> tuple[DataArray, DataArray]: + def _compute_whitener_transform(self, X: DataArray) -> tuple[DataArray, DataArray]: T, Tinv = xr.apply_ufunc( self._compute_whitener_transform_numpy, X, @@ -176,27 +184,12 @@ def _compute_whitener_transform_numpy(self, X): nc = X.shape[0] - 1 C = X.conj().T @ X / nc power = (self.alpha - 1) / 2 - T = fractional_matrix_power(C, power) + svd_kwargs = {"random_state": self.random_state} + T = fractional_matrix_power(C, power, **svd_kwargs) Tinv = np.linalg.inv(T) return T, Tinv - def _fractional_matrix_power(self, C, power): - V, s, _ = np.linalg.svd(C) - - # cut off small singular values - is_above_zero = s > np.finfo(s.dtype).eps - V = V[:, is_above_zero] - s = s[is_above_zero] - - # TODO: use hermitian=True for numpy>=2.0 - # V, s, _ = np.linalg.svd(C, hermitian=True) - C_scaled = V @ np.diag(s**power) @ V.conj().T - if np.iscomplexobj(C): - return C_scaled - else: - return C_scaled.real - - def get_Tinv(self, unwhiten_only=False) -> xr.DataArray: + def get_Tinv(self, unwhiten_only=False) -> DataArray: """Get the inverse transformation to unwhiten the data without PC transform. In contrast to `inverse_transform()`, this method returns the inverse transformation matrix without the PC transformation. That is, for PC transormed data this transformation only unwhitens the data without transforming back into the input space. For non-PC transformed data, this transformation is equivalent to the inverse transformation. 
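Editor's aside (not part of the patch): a NumPy-only sketch of the fractional whitening transform that ``Whitener.fit`` constructs from the SVD above, i.e. T = V (s / sqrt(n-1))**(alpha - 1) and its inverse. The toy data and alpha = 0 (full whitening) are assumptions; the sketch only checks the unit-covariance and round-trip properties implied by those formulas.

    import numpy as np

    rng = np.random.default_rng(42)
    n_samples, n_features = 200, 5
    X = rng.standard_normal((n_samples, n_features))
    X -= X.mean(axis=0)                      # the Whitener assumes centered data

    alpha = 0.0                              # 0 -> full whitening, 1 -> no whitening
    n_c = np.sqrt(n_samples - 1)

    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    V = Vt.conj().T                          # (feature, mode)

    T = V * (s / n_c) ** (alpha - 1)                          # feature -> whitened mode
    Tinv = ((s / n_c) ** (1 - alpha))[:, None] * V.conj().T   # whitened mode -> feature

    Z = X @ T                                                 # whitened scores
    cov = Z.conj().T @ Z / (n_samples - 1)
    np.testing.assert_allclose(cov, np.eye(n_features), atol=1e-10)  # unit covariance
    np.testing.assert_allclose(Z @ Tinv, X, atol=1e-10)              # exact round trip

Setting alpha = 1 makes both T and Tinv the plain (orthonormal) PC rotation, which is why the class can treat that case as an identity-like transform.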
@@ -208,7 +201,7 @@ def get_Tinv(self, unwhiten_only=False) -> xr.DataArray: np.diag, Tinv, input_core_dims=[["mode"]], - output_core_dims=[[self.feature_name, "mode"]], + output_core_dims=[["mode", self.feature_name]], dask="allowed", ) Tinv = Tinv.assign_coords({self.feature_name: self.s.coords["mode"].data}) @@ -216,7 +209,7 @@ def get_Tinv(self, unwhiten_only=False) -> xr.DataArray: else: return self.Tinv - def transform(self, X: xr.DataArray) -> DataArray: + def transform(self, X: DataArray) -> DataArray: """Transform new data into the fractional whitened PC space.""" self._sanity_check_input(X) @@ -229,7 +222,7 @@ def transform(self, X: xr.DataArray) -> DataArray: def fit_transform( self, - X: xr.DataArray, + X: DataArray, sample_dims: Optional[Dims] = None, feature_dims: Optional[DimsList] = None, ) -> DataArray: @@ -240,7 +233,7 @@ def inverse_transform_data(self, X: DataArray, unwhiten_only=False) -> DataArray Parameters ---------- - X: xr.DataArray + X: DataArray Data to transform back into original space. unwhiten_only: bool, default=False If True, only unwhiten the data without transforming back into the input space. This is useful when the data was transformed with PCA before whitening and you need the unwhitened data in the PC space. @@ -252,6 +245,18 @@ def inverse_transform_data(self, X: DataArray, unwhiten_only=False) -> DataArray X = X.rename({self.feature_name: "mode"}) return xr.dot(X, T_inv, dims="mode") + def transform_components(self, X: DataArray) -> DataArray: + """Transform 2D components (feature x mode) into whitened PC space.""" + + if self.is_identity: + return X + else: + dummy_dim = "dummy_dim" + VS = self.T.conj().T + VS = VS.rename({"mode": dummy_dim}) + transformed = xr.dot(VS, X, dims=self.feature_name) + return transformed.rename({dummy_dim: self.feature_name}) + def inverse_transform_components(self, X: DataArray) -> DataArray: """Transform 2D components (feature x mode) from whitened PC space back into original space.""" diff --git a/xeofs/utils/data_types.py b/xeofs/utils/data_types.py index 175eb10..262b822 100644 --- a/xeofs/utils/data_types.py +++ b/xeofs/utils/data_types.py @@ -1,10 +1,10 @@ from typing import ( + Hashable, List, - TypeAlias, Sequence, Tuple, + TypeAlias, TypeVar, - Hashable, ) import dask.array as da @@ -22,6 +22,7 @@ DataList: TypeAlias = List[Data] DataVarList: TypeAlias = List[DataVar] +GenericType = TypeVar("GenericType") DaskArray: TypeAlias = da.Array # type: ignore DataObject: TypeAlias = DataArray | DataSet | DataList diff --git a/xeofs/utils/linalg.py b/xeofs/utils/linalg.py index dba3495..2863850 100644 --- a/xeofs/utils/linalg.py +++ b/xeofs/utils/linalg.py @@ -1,7 +1,9 @@ import numpy as np +from ..models._np_classes._svd import _SVD -def fractional_matrix_power(C, power): + +def fractional_matrix_power(C, power, **kwargs): """Compute the fractional matrix power of a symmetric matrix using SVD. 
Note: This function is a simplified version of the fractional_matrix_power @@ -11,7 +13,8 @@ def fractional_matrix_power(C, power): if C.shape[0] != C.shape[1]: raise ValueError("Matrix must be square.") - V, s, _ = np.linalg.svd(C) + svd = _SVD(n_modes="all", **kwargs) + _, s, V = svd.fit_transform(C) # cut off small singular values is_above_zero = s > np.finfo(s.dtype).eps diff --git a/xeofs/utils/sanity_checks.py b/xeofs/utils/sanity_checks.py index 49ea2f2..ba03c60 100644 --- a/xeofs/utils/sanity_checks.py +++ b/xeofs/utils/sanity_checks.py @@ -102,15 +102,18 @@ def assert_not_complex(da: xr.DataArray) -> None: ) -def sanity_check_n_modes(n_modes: int | float) -> None: +def sanity_check_n_modes(n_modes: int | float | str) -> None: """Check if the number of modes is valid.""" match n_modes: case int(): if n_modes < 1: - raise ValueError("n_modes must be greater than 0") + raise ValueError("If integer, n_modes must be greater than 0") case float(): if not (0 < n_modes <= 1.0): - raise ValueError("n_modes must be in the range (0, 1]") + raise ValueError("If float, n_modes must be in the range (0, 1]") + case str(): + if n_modes not in ["all"]: + raise ValueError("If string, n_modes must be 'all'") case _: - raise TypeError("n_modes must be an integer or a float") + raise TypeError("n_modes must be an integer, float or string.") diff --git a/xeofs/utils/statistics.py b/xeofs/utils/statistics.py index 1ac6eca..3df80dd 100644 --- a/xeofs/utils/statistics.py +++ b/xeofs/utils/statistics.py @@ -1,11 +1,18 @@ -import xarray as xr import scipy as sc +import xarray as xr from statsmodels.stats.multitest import multipletests as statsmodels_multipletests from .constants import MULTIPLE_TESTS -def pearson_correlation(data1, data2, correction=None, alpha=0.05): +def pearson_correlation( + data1, + data2, + correction=None, + alpha=0.05, + sample_name="sample", + feature_name="feature", +): """Compute Pearson correlation between two xarray objects. Additionally, compute two-sided p-values and adjust them for multiple testing. @@ -40,18 +47,41 @@ def pearson_correlation(data1, data2, correction=None, alpha=0.05): Adjusted p-values for the Pearson correlation. 
""" + + def _correlation_coefficients_numpy(X, Y): + """Compute Pearson correlation coefficients assuming centered data.""" + X = X / X.std(0) + Y = Y / Y.std(0) + return X.conj().T @ Y / X.shape[0] + n_samples = data1.shape[0] # Compute Pearson correlation coefficients - corr = (data1 * data2).mean("sample") / data1.std("sample") / data2.std("sample") + sample_name_x = "sample_dim_x" + sample_name_y = "sample_dim_y" + data1 = data1.rename({sample_name: sample_name_x}) + data2 = data2.rename({sample_name: sample_name_y}) + feature_name_x = data1.dims[1] + feature_name_y = data2.dims[1] + corr = xr.apply_ufunc( + _correlation_coefficients_numpy, + data1, + data2, + input_core_dims=[ + [sample_name_x, feature_name_x], + [sample_name_y, feature_name_y], + ], + output_core_dims=[[feature_name_x, feature_name_y]], + dask="allowed", + ) # Compute two-sided p-values - pvals = _compute_pvalues(corr, n_samples, dims=["feature"]) + pvals = _compute_pvalues(corr, n_samples) if correction is not None: # Adjust p-values for multiple testing rejected, pvals = _multipletests( - pvals, dim="feature", alpha=alpha, method=correction + pvals, dim=feature_name, alpha=alpha, method=correction ) return corr, pvals @@ -59,7 +89,7 @@ def pearson_correlation(data1, data2, correction=None, alpha=0.05): return corr, pvals -def _compute_pvalues(pearsonr, n_samples: int, dims) -> xr.DataArray: +def _compute_pvalues(pearsonr, n_samples: int) -> xr.DataArray: # Compute two-sided p-values # Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html#r8c6348c62346-1 a = n_samples / 2 - 1 @@ -67,8 +97,8 @@ def _compute_pvalues(pearsonr, n_samples: int, dims) -> xr.DataArray: pvals = 2 * xr.apply_ufunc( dist.cdf, -abs(pearsonr), - input_core_dims=[dims], - output_core_dims=[dims], + input_core_dims=[[]], + output_core_dims=[[]], dask="allowed", vectorize=False, )