# Source code for mvlearn.embed.omnibus

# Copyright 2019 NeuroData (http://neurodata.io)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Omnibus embedding for multiview dimensionality reduction.
# Code from the https://github.com/neurodata/graspy package,
# reproduced and shared with permission.

# graspy is an optional dependency; if it is missing, re-raise with a message
# that points the user at the mvlearn installation instructions.
try:
    from graspy.embed import OmnibusEmbed
except ModuleNotFoundError as error:
    msg = (f"Error: {error}. package graspy required for this function. "
           "Please consult the mvlearn installation instructions at "
           "https://github.com/mvlearn/mvlearn to correctly install "
           "this dependency.")
    # Chain explicitly so the original import failure is reported as the
    # direct cause of this error.
    raise ModuleNotFoundError(msg) from error

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

from .base import BaseEmbed
from ..utils.utils import check_Xs


class Omnibus(BaseEmbed):
    """
    Omnibus computes the pairwise distances for each view. Each
    of these matrices is a n x n dissimilarity matrix where n is the number
    of rows in each view. Omnibus embedding [#1Omni]_ is then performed
    over the dissimilarity matrices and the computed embeddings are returned.

    Parameters
    ----------
    n_components : strictly positive int (default = 2)
        Desired dimensionality of output embeddings. See graspy docs for
        additional details.

    distance_metric : string (default = 'euclidean')
        Distance metric used to compute pairwise distances. Must be one of
        the metric names accepted by ``_check_params`` (the metrics of
        scipy.spatial.distance.pdist).

    normalize : string or None (default = 'l1')
        Normalize function to use on views before computing
        pairwise distances. Must be 'l2', 'l1', 'max'
        or None. If None, the views are not normalized before the
        pairwise distances are computed.

    algorithm : string (default = 'randomized')
        SVD solver to use. Must be 'full', 'randomized', or 'truncated'.
        See graspy docs for details.

    n_iter : positive int (default = 5)
        Number of iterations for randomized SVD solver. See graspy docs for
        details.

    Attributes
    ----------
    embeddings_: list of arrays (default = None)
        List of Omnibus embeddings. One embedding matrix is provided
        per view. If fit() has not been called, embeddings_ is set to
        None.

    Raises
    ------
    ValueError
        If any constructor argument fails the checks in ``_check_params``.

    Notes
    -----
    From an implementation perspective, omnibus embedding is performed
    using the GrasPy package's implementation graspy.embed.OmnibusEmbed
    for dissimilarity matrices.

    References
    ----------
    .. [#1Omni] https://graspy.neurodata.io/tutorials/embedding/omnibus

    Examples
    --------
    >>> from mvlearn.embed import omnibus
    >>> import numpy as np
    >>> # Create 2 random data views with feature sizes 50 and 100
    >>> view1 = np.random.rand(1000, 50)
    >>> view2 = np.random.rand(1000, 100)
    >>> embedder = omnibus.Omnibus(n_components=3)
    >>> embeddings = embedder.fit_transform([view1, view2])
    >>> view1_hat, view2_hat = embeddings
    >>> print(view1_hat.shape, view2_hat.shape)
    (1000, 3) (1000, 3)
    """

    def __init__(self, n_components=2, distance_metric="euclidean",
                 normalize="l1",
                 algorithm="randomized",
                 n_iter=5):

        super().__init__()
        self.n_components = n_components
        self.normalize = normalize
        self.distance_metric = distance_metric
        self.algorithm = algorithm
        self.n_iter = n_iter
        # Validate eagerly so bad arguments fail at construction time.
        self._check_params()
        self.embeddings_ = None

    def _check_params(self):
        """
        Checks that Omnibus arguments are valid. A ValueError
        is thrown if any are not. The checks performed are:
            - distance_metric is valid
            - algorithm is valid
            - normalize is valid
            - n_components is positive int
            - n_iter is positive int

        Raises
        ------
        ValueError
            If any of the checks above fails.
        """

        valid_metrics = ('braycurtis', 'canberra',
                         'chebyshev', 'cityblock', 'correlation',
                         'cosine', 'dice', 'euclidean', 'hamming',
                         'jaccard', 'jensenshannon', 'kulsinski',
                         'mahalanobis', 'matching', 'minkowski',
                         'rogerstanimoto', 'russellrao', 'seuclidean',
                         'sokalmichener', 'sokalsneath', 'sqeuclidean',
                         'yule')

        valid_algorithms = ("randomized", "full", "truncated")

        valid_normalize = ("l1", "l2", "max")

        # Messages are built from adjacent literals rather than a backslash
        # continuation inside the string, which would embed the source
        # indentation in the text shown to the user.
        if self.distance_metric not in valid_metrics:
            raise ValueError("distance_metric must be in "
                             "scipy.spatial.distance.pdist.")

        if self.algorithm not in valid_algorithms:
            raise ValueError("algorithm must be 'randomized', "
                             "'full', or 'truncated'.")

        # None is allowed and means "skip normalization".
        if self.normalize is not None and \
           self.normalize not in valid_normalize:
            raise ValueError("normalize must be 'l2', 'l1', or 'max'.")

        if not isinstance(self.n_components, int) or self.n_components <= 0:
            raise ValueError("n_components must be positive int.")

        if not isinstance(self.n_iter, int) or self.n_iter <= 0:
            raise ValueError("n_iter must be positive int.")

    def fit(self, Xs, y=None):
        """
        Fit the model with Xs and apply the embedding on Xs.
        The embeddings are saved as a class attribute.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)
            The data to embed. Each X in Xs will receive its own
            embedding. Xs is passed through check_Xs before use.
        y : ignored
            Included for API compliance.

        Returns
        -------
        self : Omnibus
            The fitted instance, with ``embeddings_`` populated.
        """
        Xs = check_Xs(Xs)

        # One dissimilarity matrix per view, computed on the (optionally)
        # normalized view.
        dissimilarities = []
        for X in Xs:
            if self.normalize is not None:
                X = normalize(X, norm=self.normalize)
            dissimilarities.append(
                pairwise_distances(X, metric=self.distance_metric))

        embedder = OmnibusEmbed(n_components=self.n_components,
                                algorithm=self.algorithm,
                                n_iter=self.n_iter)

        self.embeddings_ = embedder.fit_transform(dissimilarities)
        # Return self so calls can be chained, as scikit-learn estimators do.
        return self

    def fit_transform(self, Xs, y=None):
        """
        Fit the model with Xs and apply the embedding on Xs using
        the fit() function. The resulting embeddings are returned.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)
            The data to embed. Each X in Xs will receive its own
            embedding.
        y : ignored
            Included for API compliance.

        Returns
        -------
        embeddings : list of arrays
            list of (n_samples, n_components) matrices for each X in Xs.
        """

        self.fit(Xs, y)
        return self.embeddings_