"""Canonical Correlation Analysis"""
# Authors: Ronan Perry, Theodore Lee
# License: MIT
import numpy as np
import numbers
from scipy.stats import f, chi2
from sklearn.utils.validation import check_is_fitted
from .mcca import MCCA, _i_mcca, _mcca_gevp
from ..utils import check_Xs, param_as_list
class CCA(MCCA):
    """Canonical Correlation Analysis (CCA)

    CCA inherits from Multiview CCA (MCCA) but is restricted to 2 views,
    which allows certain statistics to be computed about the results.

    Parameters
    ----------
n_components : int (default 1)
Number of canonical components to compute and return.
regs : float | 'lw' | 'oas' | None, or list, optional (default None)
CCA regularization for each data view, which can be important
for high dimensional data. A list will specify for each view
separately. If float, must be between 0 and 1 (inclusive).
- 0 or None : corresponds to SUMCORR-AVGVAR MCCA.
- 1 : partial least squares SVD (generalizes to more than 2 views)
- 'lw' : Default ``sklearn.covariance.ledoit_wolf`` regularization
- 'oas' : Default ``sklearn.covariance.oas`` regularization
signal_ranks : int, None or list, optional (default None)
The initial signal rank to compute. If None, will compute the full SVD.
A list will specify for each view separately.
center : bool, or list (default True)
Whether or not to initially mean center the data. A list will specify
for each view separately.
    i_mcca_method : 'auto' | 'svd' | 'gevp' (default 'auto')
        Method used for informative MCCA: the SVD-based method (which
        only works with no regularization) or the GEVP-based method.
        If 'auto', an appropriate method is selected.
    multiview_output : bool, optional (default True)
        If True, the ``.transform`` method returns one dataset per view.
        Otherwise, it returns one dataset of shape
        (n_samples, n_components).

    Attributes
    ----------
means_ : list of numpy.ndarray
The means of each view, each of shape (n_features,)
loadings_ : list of numpy.ndarray
The loadings for each view used to project new data,
each of shape (n_features_b, n_components).
    common_score_norms_ : numpy.ndarray, shape (n_components,)
        Column norms of the sum of the fitted view scores.
        Used for projecting new data.
evals_ : numpy.ndarray, shape (n_components,)
The generalized eigenvalue problem eigenvalues.
n_views_ : int
The number of views
n_features_ : list
The number of features in each fitted view
n_components_ : int
        The number of components in each transformed view.

    See also
    --------
    MCCA, KMCCA

    References
    ----------
.. [#1cca] Kettenring, J. R., "Canonical Analysis of Several Sets of
Variables." Biometrika, 58:433-451, (1971)
.. [#2cca] Tenenhaus, A., et al. "Regularized generalized canonical
       correlation analysis." Psychometrika, 76:257–284, 2011

    Examples
    --------
>>> from mvlearn.embed import CCA
>>> X1 = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]]
>>> X2 = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]
>>> cca = CCA()
>>> cca.fit([X1, X2])
CCA()
    >>> Xs_scores = cca.transform([X1, X2])
    """

    def _fit(self, Xs):
"""Helper function for the `.fit` function"""
Xs, self.n_views_, _, self.n_features_ = check_Xs(
Xs, return_dimensions=True
)
        if self.n_views_ != 2:
            raise ValueError(
                f"CCA accepts exactly 2 views but {self.n_views_} "
                "were provided. Consider using MCCA for more than 2 views.")
        if not (isinstance(self.n_components, numbers.Integral) and
                1 <= self.n_components <= min(self.n_features_)):
            raise ValueError(
                "n_components must be an integer in the range "
                f"[1, {min(self.n_features_)}]")
centers = param_as_list(self.center, self.n_views_)
self.means_ = [np.mean(X, axis=0) if c else None
for X, c in zip(Xs, centers)]
Xs = [X - m if m is not None else X for X, m in zip(Xs, self.means_)]
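        # With signal_ranks set, run informative MCCA on low-rank SVDs of
        # the views; otherwise solve the MCCA generalized eigenvalue
        # problem directly.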
if self.signal_ranks is not None:
self.loadings_, scores, common_scores_normed, \
self.common_score_norms_, self.evals_ = _i_mcca(
Xs,
signal_ranks=self.signal_ranks,
n_components=self.n_components,
regs=self.regs,
method=self.i_mcca_method,
)
else:
self.loadings_, scores, common_scores_normed, \
self.common_score_norms_, self.evals_ = _mcca_gevp(
Xs,
n_components=self.n_components,
regs=self.regs
)
        return scores, common_scores_normed

    def stats(self, scores, stat=None):
        r"""
        Compute relevant statistics from the fitted CCA.

        Parameters
        ----------
        scores : array-like, shape (2, n_samples, n_components)
            The CCA scores.
        stat : str, optional (default None)
            The statistic to return. If None, returns a dictionary of all
            statistics. Otherwise, specifies one of the following
            statistics:
- 'r' : numpy.ndarray of shape (n_components,)
Canonical correlations of each component.
- 'Wilks' : numpy.ndarray of shape (n_components,)
Wilks' Lambda likelihood ratio statistic.
- 'df1' : numpy.ndarray of shape (n_components,)
Degrees of freedom for the chi-squared statistic, and
the numerator degrees of freedom for the F statistic.
- 'df2' : numpy.ndarray of shape (n_components,)
Denominator degrees of freedom for the F statistic.
- 'F' : numpy.ndarray of shape (n_components,)
Rao's approximate F statistic for H_0(k).
            - 'pF' : numpy.ndarray of shape (n_components,)
                Right-tailed p-value for stats['F'].
            - 'chisq' : numpy.ndarray of shape (n_components,)
                Bartlett's approximate chi-squared statistic for H_0(k)
                with Lawley's modification.
            - 'pChisq' : numpy.ndarray of shape (n_components,)
                Right-tailed p-value for stats['chisq'].

        Returns
        -------
stats : dict or numpy.ndarray
Dict containing the statistics with keys specified above or
one of the statistics if specified by the `stat` parameter.
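
        Examples
        --------
        A sketch of computing statistics from fitted scores; the random
        data here is purely illustrative.

        >>> import numpy as np
        >>> rng = np.random.RandomState(0)
        >>> X1, X2 = rng.randn(10, 3), rng.randn(10, 2)
        >>> cca = CCA(n_components=2).fit([X1, X2])
        >>> scores = cca.transform([X1, X2])
        >>> r = cca.stats(scores, stat='r')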
"""
check_is_fitted(self)
scores = check_Xs(scores, enforce_views=2)
S1, S2 = scores
assert S1.shape[1] == S2.shape[1], \
"Scores from each view must have the same number of components."
n_components = S1.shape[1]
stats = {}
# pearson correlation coefficient
r = self.canon_corrs(scores)
stats['r'] = r
r = r.squeeze()
# Wilks' Lambda test statistic
d = min([n_components, min(self.n_features_)])
k = np.arange(d)
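        # Per-view rank deficits: dimensions remaining in each view after
        # removing the first k components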
rank1_k = self.n_features_[0] - k
rank2_k = self.n_features_[1] - k
if r.size > 1:
nondegen = np.argwhere(r < 1 - 2 * np.finfo(float).eps).squeeze()
elif r < 1 - 2 * np.finfo(float).eps:
nondegen = np.array(0, dtype=int)
else:
nondegen = np.array([], dtype=int)
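        # log of Wilks' Lambda for each null hypothesis H_0(k); -inf marks
        # components whose correlation is numerically equal to 1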
        log_lambda = np.full(n_components, -np.inf)
if nondegen.size > 0:
if r.size > 1:
log_lambda[nondegen] = np.cumsum(
(np.log(1 - r[nondegen]**2))[::-1])
log_lambda[nondegen] = log_lambda[nondegen][::-1]
else:
log_lambda[nondegen] = np.cumsum(
(np.log(1 - r**2)))
stats['Wilks'] = np.exp(log_lambda)
# Rao's approximation to F distribution.
# default value for cases where the exponent formula fails
s = np.ones(d,)
# cases where (d1k,d2k) not one of (1,2), (2,1), or (2,2)
okCases = np.argwhere(rank1_k*rank2_k > 2).squeeze()
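        # Rao's exponent: s = sqrt((d1k^2 * d2k^2 - 4) / (d1k^2 + d2k^2 - 5))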
        snumer = rank1_k**2 * rank2_k**2 - 4
        sdenom = rank1_k**2 + rank2_k**2 - 5
s[okCases] = np.sqrt(np.divide(snumer[okCases], sdenom[okCases]))
# Degrees of freedom for null hypothesis H_0k
stats['df1'] = rank1_k * rank2_k
stats['df2'] = (
S1.shape[0] - .5 * (self.n_features_[0] + self.n_features_[1] + 3)
) * s - (.5 * rank1_k * rank2_k) + 1
# Rao's F statistic
pow_lambda = stats['Wilks']**(1 / s)
ratio = np.inf * np.ones(d,)
ratio[nondegen] = ((1 - pow_lambda[nondegen]) / pow_lambda[nondegen])
stats['F'] = ratio * stats['df2'] / stats['df1']
        # Right-tailed p-value for Rao's F
stats['pF'] = 1 - f.cdf(stats['F'], stats['df1'], stats['df2'])
# Lawley's modification to Bartlett's chi-squared statistic
if r.size == 1:
r = np.array([r])
stats['chisq'] = -log_lambda * (
S1.shape[0] - k -
0.5 * (self.n_features_[0] + self.n_features_[1] + 3) +
np.cumsum(np.hstack((np.zeros(1,), 1 / r[: d-1]))**2))
        # Right-tailed p-value for Lawley's modification to Bartlett
stats['pChisq'] = 1 - chi2.cdf(stats['chisq'], stats['df1'])
if stat is None:
return stats
else:
try:
return stats[stat]
except KeyError:
raise KeyError(f"Provided statistic {stat} must be one of"
" the statistics listed in the Parameters.")