# Source code for mvlearn.embed.gcca

# License: MIT

from .base import BaseEmbed
from ..utils.utils import check_Xs

import numpy as np
from scipy import linalg, stats
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed
from .utils import select_dimension
import warnings


def center(X):
    r"""
    Subtracts the row means and divides by the row standard deviations.
    Then subtracts column means.

    Parameters
    ----------
    X : array-like, shape (n_observations, n_features)
        The data to preprocess

    Returns
    -------
    centered_X : numpy.ndarray, shape (n_observations, n_features)
        The preprocessed data matrix
    """

    # Z-score each row using the sample mean and sample std (ddof=1)
    centered_X = stats.zscore(X, axis=1, ddof=1)
    # Then subtract the mean of each column
    mu = np.mean(centered_X, axis=0)
    centered_X -= mu
    return centered_X

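# A minimal doctest-style sketch of ``center`` (the array ``X`` below is
# illustrative only). Rows are z-scored first, so the subsequent column
# centering leaves every column with exactly zero mean:
#
# >>> import numpy as np
# >>> X = np.arange(6, dtype=float).reshape(2, 3)
# >>> Xc = center(X)
# >>> np.allclose(Xc.mean(axis=0), 0)
# True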

class GCCA(BaseEmbed):
    r"""
    An implementation of Generalized Canonical Correlation Analysis
    [#1GCCA]_, suitable for cases where the number of features exceeds
    the number of samples, which first applies single-view
    dimensionality reduction. Computes individual projections into a
    common subspace such that the pairwise correlations between the
    projections are maximized. Note that this is applicable to any
    number of views, not just two.

    Parameters
    ----------
    n_components : int (positive), optional, default=None
        If ``self.sv_tolerance=None``, selects the number of SVD
        components to keep for each view. If None, another selection
        method is used.

    fraction_var : float, default=None
        If ``self.sv_tolerance=None`` and ``self.n_components=None``,
        selects the number of SVD components to keep for each view as
        the number needed to capture this fraction of the variance.
        If None, another selection method is used.

    sv_tolerance : float, optional, default=None
        Selects the number of SVD components to keep for each view by
        thresholding the singular values. If None, another selection
        method is used.

    n_elbows : int, optional, default=2
        If ``self.fraction_var=None``, ``self.sv_tolerance=None``, and
        ``self.n_components=None``, then compute the optimal embedding
        dimension using :func:`.utils.select_dimension`. Otherwise,
        ignored.

    tall : boolean, default=False
        Set to True if n_samples > n_features; speeds up the SVD.

    max_rank : boolean, default=False
        If True, sets the rank of the common latent space as the
        maximum rank of the individual spaces. If False, uses the
        minimum individual rank.

    n_jobs : int (positive), default=None
        The number of jobs to run in parallel when computing the SVDs
        for each view in `fit` and `partial_fit`. `None` means 1 job,
        `-1` means using all processors.

    Attributes
    ----------
    projection_mats_ : list of arrays
        A projection matrix for each view, from the given space to the
        latent space

    ranks_ : list of ints
        Number of left singular vectors kept for each view during the
        first SVD

    Notes
    -----
    Consider two views :math:`X_1` and :math:`X_2`. Canonical
    Correlation Analysis seeks to find vectors :math:`a_1` and
    :math:`a_2` that maximize the correlation between :math:`X_1 a_1`
    and :math:`X_2 a_2`, expanded below.

    .. math::
        \left(\frac{a_1^T C_{12} a_2}
            {\sqrt{a_1^T C_{11} a_1 \cdot a_2^T C_{22} a_2}}
        \right)

    where :math:`C_{11}`, :math:`C_{22}`, and :math:`C_{12}` are
    respectively the view 1, view 2, and between-view covariance
    matrix estimates. GCCA maximizes the sum of these correlations
    across all pairwise views and computes a set of linearly
    independent components.

    This specific algorithm first applies principal component analysis
    (PCA) independently to each view and then aligns the most
    informative projections to find correlated and informative
    subspaces. The parameters that control the embedding dimension
    apply to the PCA step. The dimension of each aligned subspace is
    the maximum or minimum of the individual dimensions, per the
    `max_rank` parameter. Using the maximum captures the most
    information from all views but may also keep noise from some
    views. Using the minimum removes noise dimensions more
    aggressively, at the cost of information from some views.

    References
    ----------
    .. [#1GCCA] B. Afshin-Pour, G.A. Hossein-Zadeh, S.C. Strother, H.
            Soltanian-Zadeh. "Enhancing reproducibility of fMRI
            statistical maps using generalized canonical correlation
            analysis in NPAIRS framework." NeuroImage, volume 60,
            pp. 1970-1981, 2012.

    Examples
    --------
    >>> from mvlearn.datasets import load_UCImultifeature
    >>> from mvlearn.embed import GCCA
    >>> # Load full dataset, labels not needed
    >>> Xs, _ = load_UCImultifeature()
    >>> gcca = GCCA(fraction_var=0.9)
    >>> # Transform the first 5 views
    >>> Xs_latents = gcca.fit_transform(Xs[:5])
    >>> print([X.shape[1] for X in Xs_latents])
    [9, 9, 9, 9, 9]
    """

    def __init__(
        self,
        n_components=None,
        fraction_var=None,
        sv_tolerance=None,
        n_elbows=2,
        tall=False,
        max_rank=False,
        n_jobs=None,
    ):
        self.n_components = n_components
        self.fraction_var = fraction_var
        self.sv_tolerance = sv_tolerance
        self.n_elbows = n_elbows
        self.tall = tall
        self.projection_mats_ = None
        self.ranks_ = None
        self.max_rank = max_rank
        self.n_jobs = n_jobs

    def fit(self, Xs, y=None):
        r"""
        Calculates a projection from each view to a latent space such
        that the sum of pairwise latent space correlations is
        maximized. Each view 'X' is normalized and its left singular
        vectors are calculated using SVD. The number of singular
        vectors kept is determined by either a singular value
        threshold, a given number of components, or the fraction of
        variance explained. The kept singular vectors are
        concatenated, and the SVD of that matrix is used to calculate
        the projections for each view.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)
            The data to fit to. Each view will receive its own
            embedding.

        y : ignored
            Included for API compliance.

        Returns
        -------
        self : returns an instance of self.
        """
        Xs = check_Xs(Xs, multiview=True)
        n = Xs[0].shape[0]
        min_m = min(X.shape[1] for X in Xs)

        # Center each view in parallel
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(center)(X) for X in Xs
        )

        # Compute the per-view SVDs in parallel
        usvr = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_view)(X, n, min_m) for X in Xs
        )

        # Reshape from parallel output
        self._Uall, self._Sall, self._Vall, self.ranks_ = zip(*usvr)

        self = self._fit_multistep()

        return self

    def partial_fit(self, Xs, reset=False, multiview_step=True):
        r"""
        Performs like `fit`, but does not overwrite previously fitted
        single views and instead uses them along with the new data.
        Useful if the data needs to be processed in batches.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)
            The data to fit to. Each view will receive its own
            embedding.

        reset : boolean (default = False)
            If True, overwrites all prior computations.

        multiview_step : boolean, (default = True)
            If True, performs the joint SVD step on the results from
            the individual views. Must be set to True in the final
            call.

        Returns
        -------
        self : returns an instance of self.
        """
        if not hasattr(self, '_Uall') or reset:
            self._Uall = []
            self._Sall = []
            self._Vall = []
            self.ranks_ = []

        Xs = check_Xs(Xs, multiview=False)
        n = Xs[0].shape[0]
        min_m = min(X.shape[1] for X in Xs)

        # Center each view in parallel
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(center)(X) for X in Xs
        )

        # Compute the per-view SVDs in parallel
        usvr = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_view)(X, n, min_m) for X in Xs
        )

        # Reshape and concatenate from parallel output
        u, s, v, r = zip(*usvr)
        self._Uall += u
        self._Sall += s
        self._Vall += v
        self.ranks_ += r

        if multiview_step:
            if len(self.ranks_) < 2:
                msg = (
                    "Fewer than two single views fitted. Unable to "
                    "perform multiview step."
                )
                warnings.warn(msg, UserWarning)
            else:
                self = self._fit_multistep()

        return self
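
    # Overview of the fitting pipeline implemented below (a descriptive
    # summary of the code in this class, not additional API):
    #   1. center(): z-score each row, then subtract column means,
    #      per view
    #   2. _fit_view(): per-view SVD with rank selection, tried in the
    #      order sv_tolerance, n_components, fraction_var, then elbow
    #      selection via select_dimension
    #   3. _fit_multistep(): joint SVD of the concatenated left
    #      singular vectors, yielding one projection matrix per view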
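
    # A hedged sketch of batched fitting with ``partial_fit`` (the
    # arrays ``X1``..``X3`` are illustrative, not part of the module;
    # all views must share n_samples):
    #
    # >>> gcca = GCCA(n_components=2)
    # >>> _ = gcca.partial_fit([X1, X2], multiview_step=False)
    # >>> _ = gcca.partial_fit([X3], multiview_step=True)
    # >>> latent_3 = gcca.transform(X3, view_idx=2)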
""" # Preprocess X[np.isnan(X)] = 0 # compute the SVD of the data if self.tall: v, s, ut = linalg.svd(X.T, full_matrices=False) else: u, s, vt = linalg.svd(X, full_matrices=False) ut = u.T v = vt.T # Dimensions to reduce to if self.sv_tolerance: if not isinstance(self.sv_tolerance, float) and not isinstance( self.sv_tolerance, int ): raise TypeError("sv_tolerance must be numeric") elif self.sv_tolerance <= 0: raise ValueError( "sv_tolerance must be greater than 0" ) rank = sum(s > self.sv_tolerance) elif self.n_components: if not isinstance(self.n_components, int): raise TypeError("n_components must be an integer") elif self.n_components <= 0: raise ValueError( "n_components must be greater than 0" ) elif self.n_components > min((n, min_m)): raise ValueError( "n_components must be less than or equal to the \ minimum input rank" ) rank = self.n_components elif self.fraction_var: if not isinstance(self.fraction_var, float) and not isinstance( self.fraction_var, int ): raise TypeError( "fraction_var must be an integer or float" ) elif self.fraction_var <= 0 or self.fraction_var > 1: raise ValueError("fraction_var must be in (0,1]") s2 = np.square(s) rank = sum(np.cumsum(s2 / sum(s2)) < self.fraction_var) + 1 else: # Sweep over only first log2, else too large elbows s = s[: int(np.ceil(np.log2(np.min(X.shape))))] elbows, _ = select_dimension( s, n_elbows=self.n_elbows, threshold=None ) rank = elbows[-1] u = ut.T[:, :rank] return u, s, v, rank def _fit_multistep(self): """ Helper function to compute the SVD on the results from individuals view SVDs. """ if self.max_rank: d = max(self.ranks_) else: d = min(self.ranks_) # Create a concatenated view of Us Uall_c = np.concatenate(self._Uall, axis=1) _, _, VV = svds(Uall_c, d) VV = np.flip(VV.T, axis=1) VV = VV[:, : min([d, VV.shape[1]])] # SVDS the concatenated Us idx_end = 0 projection_mats = [] n = len(self.ranks_) for i in range(n): idx_start = idx_end idx_end = idx_start + self.ranks_[i] VVi = normalize(VV[idx_start:idx_end, :], "l2", axis=0) # Compute the canonical projections A = np.sqrt(n - 1) * self._Vall[i][:, : self.ranks_[i]] A = A @ (linalg.solve( np.diag(self._Sall[i][: self.ranks_[i]]), VVi )) projection_mats.append(A) self.projection_mats_ = projection_mats return self def transform(self, Xs, view_idx=None): r""" Embeds data matrix(s) using the fitted projection matrices. May be used for out-of-sample embeddings. Parameters ---------- Xs : list of array-likes or numpy.ndarray - Xs length: n_views - Xs[i] shape: (n_samples, n_features_i) A list of data matrices from each view to transform based on the prior fit function. If view_idx is defined, then Xs is a 2D data matrix corresponding to a single view. view_idx : int, default=None For transformation of a single view. If not None, then Xs is 2D and views_idx specifies the index of the view from which Xs comes from. Returns ------- Xs_transformed : list of array-likes or array-like Same shape as Xs """ if self.projection_mats_ is None: raise RuntimeError("Must call fit function before transform") Xs = check_Xs(Xs) if view_idx is not None: return center(Xs[0]) @ self.projection_mats_[view_idx] else: return np.array( [ center(X) @ proj for X, proj in zip(Xs, self.projection_mats_) ] )