Source code for mvlearn.decomposition.grouppca

"""Group Principal Component Analysis."""

# Authors: Pierre Ablin, Hugo Richard
#
# License: MIT

import numpy as np
from scipy import linalg
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_is_fitted


from ..utils.utils import check_Xs
from .base import BaseDecomposer


[docs]class GroupPCA(BaseDecomposer):
    r"""Group Principal Component Analysis.

    As an optional preprocessing, each dataset in `Xs` is reduced with
    usual PCA. Then, datasets are concatenated in the features direction,
    and a PCA is performed on this matrix, yielding a single output dataset.
    Linear coefficients linking the output dataset and each view are computed
    using least squares estimation, which permits to return one dataset per
    view.

    Parameters
    ----------
    n_components : int, optional
        Number of components to extract. If None, n_components is set to
        the minimum number of features in the dataset.

    n_individual_components : int or list of int or 'auto', optional
        The number of individual components to extract as a preprocessing.
        If None, no preprocessing is applied. If an `int`, each dataset
        is reduced to this dimension. If a list, the dataset `i` is
        reduced to the dimension `n_individual_components[i]`.
        If `'auto'`, set to the minimum between n_components and the
        smallest number of features in each dataset.

    multiview_output : bool, optional (default True)
        If True, the `.transform` method returns one dataset per view.
        Otherwise, it returns one dataset, of shape (n_samples, n_components)

    prewhiten : bool, optional (default False)
        Whether the data should be whitened after the original preprocessing.

    whiten : bool, optional (default False)
        When True (False by default) the `components_` vectors are multiplied
        by the square root of n_samples and then divided by the singular values
        to ensure uncorrelated outputs with unit component-wise variances.
        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometime
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    random_state : int, RandomState instance, default=None
        Used when ``svd_solver`` == 'arpack' or 'randomized'. Pass an int
        for reproducible results across multiple function calls.

    Attributes
    ----------
    components_ : array, shape (n_components, n_total_features)
        Principal axes in feature space, representing the directions of
        maximum variance in the data. The components are sorted by
        ``explained_variance_``. `n_total_features` is the sum of all
        the number of input features.

    explained_variance_ : array, shape (n_components,)
        The amount of variance explained by each of the selected components.

    explained_variance_ratio_ : array, shape (n_components,)
        Percentage of variance explained by each of the selected components.
        If ``n_components`` is not set then all components are stored and the
        sum of the ratios is equal to 1.0.

    individual_components_ : list of array
        Individual components for each individual PCA.
        `individual_components_[i]` is an array of shape
        (n_individual_components, n_features) where n_features is the number of
        features in the dataset `i`.

    individual_explained_variance_ : list of array
        Individual explained variance for each individual PCA.
        `individual_explained_variance_[i]` is an array of shape
        (n_individual_components, ).

    individual_explained_variance_ratio_ : list of array
        Individual explained variance ratio for each individual PCA.
        `individual_explained_variance_ratio_[i]` is an array of shape
        (n_individual_components, ) where n_features is the number of
        features in the dataset `i`.

    individual_mean_ : list of array
        Mean of each dataset, estimated on the training data
        `individual_mean_[i]` is an array of shape
        (n_features) where n_features is the number of
        features in the dataset `i`.

    individual_projections_ : list of array
        List containing the linear transform linking the dataset to the output.
        Xs[i].dot(individual_projections_[i].T) gives the estimated reduced
        dataset for view i. This is obtained by least squares estimation.

    individual_embeddings_ : list of array
       List containing the pseudo-inverses of individual_projections_.
       Allows to recover each original dataset from reduced data.

    n_components_ : int
        The estimated number of components.

    n_features_ : list of int
        Number of features in each training dataset.

    n_samples_ : int
        Number of samples in the training data.

    n_views_ : int
        Number of views in the training data

    References
    ----------
    .. [#1grouppca] Vince D Calhoun, et al.
            "A method for making group inferences from
            functional MRI data using independent component analysis."
            Human Brain Mapping, 14(3):140–151, 2001.

    Examples
    --------
    >>> from mvlearn.datasets import load_UCImultifeature
    >>> from mvlearn.decomposition import GroupPCA
    >>> Xs, _ = load_UCImultifeature()
    >>> pca = GroupPCA(n_components=3)
    >>> Xs_transformed = pca.fit_transform(Xs)
    >>> print(len(Xs_transformed))
    6
    >>> print(Xs_transformed[0].shape)
    (2000, 3)
    """

    def __init__(
        self,
        n_components=None,
        n_individual_components="auto",
        multiview_output=True,
        prewhiten=False,
        whiten=False,
        random_state=None,
    ):
        self.n_components = n_components
        self.n_individual_components = n_individual_components
        self.multiview_output = multiview_output
        self.prewhiten = prewhiten
        self.whiten = whiten
        self.random_state = random_state

    def fit(self, Xs, y=None):
        """Fit  to the data.

        This merges datasets together and reduces the dimensionality.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)

        y : None
            Ignored variable.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X_transformed, n_views, n_samples, n_features = check_Xs(
            Xs, copy=True, return_dimensions=True
        )
        self.n_views_ = n_views
        self.n_features_ = n_features
        self.n_samples_ = n_samples

        if self.n_components is None:
            self.n_components_ = min(n_features)
        else:
            self.n_components_ = self.n_components
        if self.n_individual_components == "auto":
            self.n_individual_components_ = min(
                self.n_components_, min(n_features)
            )
        else:
            self.n_individual_components_ = self.n_individual_components
        if self.n_individual_components_ is None and self.prewhiten:
            # Still need to whiten data
            self.n_individual_components_ = self.n_features_
        self.individual_pca_ = self.n_individual_components_ is not None
        self.individual_mean_ = [np.mean(X, axis=0) for X in Xs]
        if self.individual_pca_:
            if type(self.n_individual_components_) == int:
                one_dimension = True
            else:
                one_dimension = False
            self.individual_components_ = []
            self.individual_explained_variance_ = []
            self.individual_explained_variance_ratio_ = []
            for i, X in enumerate(Xs):
                if one_dimension:
                    dimension = self.n_individual_components_
                else:
                    dimension = self.n_individual_components_[i]
                pca = PCA(
                    dimension,
                    whiten=self.prewhiten,
                    random_state=self.random_state,
                )
                X_transformed[i] = pca.fit_transform(X)
                self.individual_components_.append(pca.components_)
                self.individual_explained_variance_ratio_.append(
                    pca.explained_variance_ratio_
                )
                self.individual_explained_variance_.append(
                    pca.explained_variance_
                )
        X_stack = np.hstack(X_transformed)
        pca = PCA(self.n_components_, whiten=self.whiten)
        X_transformed = pca.fit_transform(X_stack)
        self.individual_projections_ = []
        self.individual_embeddings_ = []
        transformed_pinv = linalg.pinv(X_transformed)
        for X, mean in zip(Xs, self.individual_mean_):
            lstq_solution = np.dot(transformed_pinv, X - mean)
            self.individual_projections_.append(linalg.pinv(lstq_solution).T)
            self.individual_embeddings_.append(lstq_solution.T)
        self.components_ = pca.components_
        self.explained_variance_ = pca.explained_variance_
        self.explained_variance_ratio_ = pca.explained_variance_ratio_
        return self

    def transform(self, Xs, y=None, index=None):
        r"""Apply groupPCA to Xs.

        Xs is projected on the principal components learned
        from the training set.

        Parameters
        ----------
        Xs: list of array-likes
            - Xs shape: (n_views,)
            - Xs[i] shape: (n_samples, n_features_i)

        y : None
            Ignored variable.

        index: int or array-like, default=None
            The index or list of indices of the fitted views to which the
            inputted views correspond. If None, there should be as many
            inputted views as the fitted views and in the same order.
            Note that the index parameter is not available in all methods of
            mvlearn yet.

        Returns
        -------
        X_transformed : list of array-likes or numpy.ndarray
            The transformed data.
            If `multiview_output` is True, it is a list with the estimated
            individual principal components.
            If `multiview_output` is False, it is a single array containing the
            shared principal components.
        """
        check_is_fitted(self)
        Xs = check_Xs(Xs, copy=True)

        if index is None:
            index_ = np.arange(self.n_views_)
        else:
            index_ = np.copy(index)
            index_ = np.atleast_1d(index_)

        multiview_output = [
            np.dot(X - mean, W.T)
            for W, X, mean in (
                zip(
                    [self.individual_projections_[i] for i in index_],
                    Xs,
                    [self.individual_mean_[i] for i in index_],
                )
            )
        ]
        if self.multiview_output:
            return multiview_output

        if index is not None:
            return np.mean(multiview_output, axis=0,)

        if self.individual_pca_:
            for i, (X, mean, components_, explained_variance_) in enumerate(
                zip(
                    Xs,
                    self.individual_mean_,
                    self.individual_components_,
                    self.individual_explained_variance_,
                )
            ):
                X = X - mean
                X_transformed = np.dot(X, components_.T)
                if self.prewhiten:
                    X_transformed /= np.sqrt(explained_variance_)
                Xs[i] = X_transformed
        else:
            Xs = [X - mean for X, mean in zip(Xs, self.individual_mean_)]
        X_stack = np.hstack(Xs)
        X_transformed = np.dot(X_stack, self.components_.T)
        if self.whiten:
            X_transformed /= np.sqrt(self.explained_variance_)
        return X_transformed

    def inverse_transform(self, X_transformed, index=None):
        r"""Recover multiview data from transformed data.

        Returns an array Xs such that the transform of Xs would be
        X_transformed

        Parameters
        ----------
        X_transformed : list of array-likes or numpy.ndarray
            The dataset corresponding to transformed data

        index: int or array-like, default=None
            The index or list of indices of the fitted views to which the
            inputted views correspond. If None, there should be as many
            inputted views as the fitted views and in the same order.
            Note that the index parameter is not available in all methods of
            mvlearn yet.

        Returns
        -------
        Xs : list of arrays
            The recovered individual datasets
        """
        check_is_fitted(self)
        if index is None:
            index_ = np.arange(self.n_views_)
        else:
            index_ = np.copy(index)
            index_ = np.atleast_1d(index)

        if self.multiview_output:
            assert len(X_transformed) == len(index_)
            X_transformed = check_Xs(X_transformed)
            return [
                np.dot(X, A.T) + mean
                for X, A, mean in (
                    zip(
                        X_transformed,
                        [self.individual_embeddings_[i] for i in index_],
                        [self.individual_mean_[i] for i in index_],
                    )
                )
            ]
        if index is not None:
            return [
                np.dot(X_transformed, A.T) + mean
                for A, mean in (
                    zip(
                        [self.individual_embeddings_[i] for i in index_],
                        [self.individual_mean_[i] for i in index_],
                    )
                )
            ]

        # Inverse stacked PCA
        if self.whiten:
            X_t = X_transformed * np.sqrt(self.explained_variance_)
        else:
            X_t = X_transformed
        X_stack = np.dot(X_t, self.components_)

        if self.individual_pca_:
            Xs = []
            cur_p = 0
            for (mean, components_, explained_variance_) in zip(
                self.individual_mean_,
                self.individual_components_,
                self.individual_explained_variance_,
            ):
                n_features_i = components_.shape[0]
                sl = slice(cur_p, cur_p + n_features_i)
                X_i = X_stack[:, sl]
                if self.prewhiten:
                    X_i *= np.sqrt(explained_variance_)
                X_i = np.dot(X_i, components_)
                X_i = X_i + mean
                Xs.append(X_i)
                cur_p += n_features_i
        else:
            Xs = np.split(X_stack, np.cumsum(self.n_features_)[:-1], axis=1)
            Xs = [X + mean for X, mean in zip(Xs, self.individual_mean_)]
        return Xs