"""Group Principal Component Analysis."""
# Authors: Pierre Ablin, Hugo Richard
#
# License: MIT
import numpy as np
from scipy import linalg
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_is_fitted
from ..utils.utils import check_Xs
from .base import BaseDecomposer
[docs]class GroupPCA(BaseDecomposer):
r"""Group Principal Component Analysis.
As an optional preprocessing, each dataset in `Xs` is reduced with
usual PCA. Then, datasets are concatenated in the features direction,
and a PCA is performed on this matrix, yielding a single output dataset.
Linear coefficients linking the output dataset and each view are computed
using least squares estimation, which permits to return one dataset per
view.
Parameters
----------
n_components : int, optional
Number of components to extract. If None, n_components is set to
the minimum number of features in the dataset.
n_individual_components : int or list of int or 'auto', optional
The number of individual components to extract as a preprocessing.
If None, no preprocessing is applied. If an `int`, each dataset
is reduced to this dimension. If a list, the dataset `i` is
reduced to the dimension `n_individual_components[i]`.
If `'auto'`, set to the minimum between n_components and the
smallest number of features in each dataset.
multiview_output : bool, optional (default True)
If True, the `.transform` method returns one dataset per view.
Otherwise, it returns one dataset, of shape (n_samples, n_components)
prewhiten : bool, optional (default False)
Whether the data should be whitened after the original preprocessing.
whiten : bool, optional (default False)
When True (False by default) the `components_` vectors are multiplied
by the square root of n_samples and then divided by the singular values
to ensure uncorrelated outputs with unit component-wise variances.
Whitening will remove some information from the transformed signal
(the relative variance scales of the components) but can sometime
improve the predictive accuracy of the downstream estimators by
making their data respect some hard-wired assumptions.
random_state : int, RandomState instance, default=None
Used when ``svd_solver`` == 'arpack' or 'randomized'. Pass an int
for reproducible results across multiple function calls.
Attributes
----------
components_ : array, shape (n_components, n_total_features)
Principal axes in feature space, representing the directions of
maximum variance in the data. The components are sorted by
``explained_variance_``. `n_total_features` is the sum of all
the number of input features.
explained_variance_ : array, shape (n_components,)
The amount of variance explained by each of the selected components.
explained_variance_ratio_ : array, shape (n_components,)
Percentage of variance explained by each of the selected components.
If ``n_components`` is not set then all components are stored and the
sum of the ratios is equal to 1.0.
individual_components_ : list of array
Individual components for each individual PCA.
`individual_components_[i]` is an array of shape
(n_individual_components, n_features) where n_features is the number of
features in the dataset `i`.
individual_explained_variance_ : list of array
Individual explained variance for each individual PCA.
`individual_explained_variance_[i]` is an array of shape
(n_individual_components, ).
individual_explained_variance_ratio_ : list of array
Individual explained variance ratio for each individual PCA.
`individual_explained_variance_ratio_[i]` is an array of shape
(n_individual_components, ) where n_features is the number of
features in the dataset `i`.
individual_mean_ : list of array
Mean of each dataset, estimated on the training data
`individual_mean_[i]` is an array of shape
(n_features) where n_features is the number of
features in the dataset `i`.
individual_projections_ : list of array
List containing the linear transform linking the dataset to the output.
Xs[i].dot(individual_projections_[i].T) gives the estimated reduced
dataset for view i. This is obtained by least squares estimation.
individual_embeddings_ : list of array
List containing the pseudo-inverses of individual_projections_.
Allows to recover each original dataset from reduced data.
n_components_ : int
The estimated number of components.
n_features_ : list of int
Number of features in each training dataset.
n_samples_ : int
Number of samples in the training data.
n_views_ : int
Number of views in the training data
References
----------
.. [#1grouppca] Vince D Calhoun, et al.
"A method for making group inferences from
functional MRI data using independent component analysis."
Human Brain Mapping, 14(3):140–151, 2001.
Examples
--------
>>> from mvlearn.datasets import load_UCImultifeature
>>> from mvlearn.decomposition import GroupPCA
>>> Xs, _ = load_UCImultifeature()
>>> pca = GroupPCA(n_components=3)
>>> Xs_transformed = pca.fit_transform(Xs)
>>> print(len(Xs_transformed))
6
>>> print(Xs_transformed[0].shape)
(2000, 3)
"""
def __init__(
self,
n_components=None,
n_individual_components="auto",
multiview_output=True,
prewhiten=False,
whiten=False,
random_state=None,
):
self.n_components = n_components
self.n_individual_components = n_individual_components
self.multiview_output = multiview_output
self.prewhiten = prewhiten
self.whiten = whiten
self.random_state = random_state
def fit(self, Xs, y=None):
"""Fit to the data.
This merges datasets together and reduces the dimensionality.
Parameters
----------
Xs : list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
y : None
Ignored variable.
Returns
-------
self : object
Returns the instance itself.
"""
X_transformed, n_views, n_samples, n_features = check_Xs(
Xs, copy=True, return_dimensions=True
)
self.n_views_ = n_views
self.n_features_ = n_features
self.n_samples_ = n_samples
if self.n_components is None:
self.n_components_ = min(n_features)
else:
self.n_components_ = self.n_components
if self.n_individual_components == "auto":
self.n_individual_components_ = min(
self.n_components_, min(n_features)
)
else:
self.n_individual_components_ = self.n_individual_components
if self.n_individual_components_ is None and self.prewhiten:
# Still need to whiten data
self.n_individual_components_ = self.n_features_
self.individual_pca_ = self.n_individual_components_ is not None
self.individual_mean_ = [np.mean(X, axis=0) for X in Xs]
if self.individual_pca_:
if type(self.n_individual_components_) == int:
one_dimension = True
else:
one_dimension = False
self.individual_components_ = []
self.individual_explained_variance_ = []
self.individual_explained_variance_ratio_ = []
for i, X in enumerate(Xs):
if one_dimension:
dimension = self.n_individual_components_
else:
dimension = self.n_individual_components_[i]
pca = PCA(
dimension,
whiten=self.prewhiten,
random_state=self.random_state,
)
X_transformed[i] = pca.fit_transform(X)
self.individual_components_.append(pca.components_)
self.individual_explained_variance_ratio_.append(
pca.explained_variance_ratio_
)
self.individual_explained_variance_.append(
pca.explained_variance_
)
X_stack = np.hstack(X_transformed)
pca = PCA(self.n_components_, whiten=self.whiten)
X_transformed = pca.fit_transform(X_stack)
self.individual_projections_ = []
self.individual_embeddings_ = []
transformed_pinv = linalg.pinv(X_transformed)
for X, mean in zip(Xs, self.individual_mean_):
lstq_solution = np.dot(transformed_pinv, X - mean)
self.individual_projections_.append(linalg.pinv(lstq_solution).T)
self.individual_embeddings_.append(lstq_solution.T)
self.components_ = pca.components_
self.explained_variance_ = pca.explained_variance_
self.explained_variance_ratio_ = pca.explained_variance_ratio_
return self
def transform(self, Xs, y=None, index=None):
r"""Apply groupPCA to Xs.
Xs is projected on the principal components learned
from the training set.
Parameters
----------
Xs: list of array-likes
- Xs shape: (n_views,)
- Xs[i] shape: (n_samples, n_features_i)
y : None
Ignored variable.
index: int or array-like, default=None
The index or list of indices of the fitted views to which the
inputted views correspond. If None, there should be as many
inputted views as the fitted views and in the same order.
Note that the index parameter is not available in all methods of
mvlearn yet.
Returns
-------
X_transformed : list of array-likes or numpy.ndarray
The transformed data.
If `multiview_output` is True, it is a list with the estimated
individual principal components.
If `multiview_output` is False, it is a single array containing the
shared principal components.
"""
check_is_fitted(self)
Xs = check_Xs(Xs, copy=True)
if index is None:
index_ = np.arange(self.n_views_)
else:
index_ = np.copy(index)
index_ = np.atleast_1d(index_)
multiview_output = [
np.dot(X - mean, W.T)
for W, X, mean in (
zip(
[self.individual_projections_[i] for i in index_],
Xs,
[self.individual_mean_[i] for i in index_],
)
)
]
if self.multiview_output:
return multiview_output
if index is not None:
return np.mean(multiview_output, axis=0,)
if self.individual_pca_:
for i, (X, mean, components_, explained_variance_) in enumerate(
zip(
Xs,
self.individual_mean_,
self.individual_components_,
self.individual_explained_variance_,
)
):
X = X - mean
X_transformed = np.dot(X, components_.T)
if self.prewhiten:
X_transformed /= np.sqrt(explained_variance_)
Xs[i] = X_transformed
else:
Xs = [X - mean for X, mean in zip(Xs, self.individual_mean_)]
X_stack = np.hstack(Xs)
X_transformed = np.dot(X_stack, self.components_.T)
if self.whiten:
X_transformed /= np.sqrt(self.explained_variance_)
return X_transformed
def inverse_transform(self, X_transformed, index=None):
r"""Recover multiview data from transformed data.
Returns an array Xs such that the transform of Xs would be
X_transformed
Parameters
----------
X_transformed : list of array-likes or numpy.ndarray
The dataset corresponding to transformed data
index: int or array-like, default=None
The index or list of indices of the fitted views to which the
inputted views correspond. If None, there should be as many
inputted views as the fitted views and in the same order.
Note that the index parameter is not available in all methods of
mvlearn yet.
Returns
-------
Xs : list of arrays
The recovered individual datasets
"""
check_is_fitted(self)
if index is None:
index_ = np.arange(self.n_views_)
else:
index_ = np.copy(index)
index_ = np.atleast_1d(index)
if self.multiview_output:
assert len(X_transformed) == len(index_)
X_transformed = check_Xs(X_transformed)
return [
np.dot(X, A.T) + mean
for X, A, mean in (
zip(
X_transformed,
[self.individual_embeddings_[i] for i in index_],
[self.individual_mean_[i] for i in index_],
)
)
]
if index is not None:
return [
np.dot(X_transformed, A.T) + mean
for A, mean in (
zip(
[self.individual_embeddings_[i] for i in index_],
[self.individual_mean_[i] for i in index_],
)
)
]
# Inverse stacked PCA
if self.whiten:
X_t = X_transformed * np.sqrt(self.explained_variance_)
else:
X_t = X_transformed
X_stack = np.dot(X_t, self.components_)
if self.individual_pca_:
Xs = []
cur_p = 0
for (mean, components_, explained_variance_) in zip(
self.individual_mean_,
self.individual_components_,
self.individual_explained_variance_,
):
n_features_i = components_.shape[0]
sl = slice(cur_p, cur_p + n_features_i)
X_i = X_stack[:, sl]
if self.prewhiten:
X_i *= np.sqrt(explained_variance_)
X_i = np.dot(X_i, components_)
X_i = X_i + mean
Xs.append(X_i)
cur_p += n_features_i
else:
Xs = np.split(X_stack, np.cumsum(self.n_features_)[:-1], axis=1)
Xs = [X + mean for X, mean in zip(Xs, self.individual_mean_)]
return Xs